Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp.asm @ 003243c3

History | View | Annotate | Download (72.8 KB)

1
;******************************************************************************
2
;* VP8 MMXEXT optimizations
3
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5
;*
6
;* This file is part of FFmpeg.
7
;*
8
;* FFmpeg is free software; you can redistribute it and/or
9
;* modify it under the terms of the GNU Lesser General Public
10
;* License as published by the Free Software Foundation; either
11
;* version 2.1 of the License, or (at your option) any later version.
12
;*
13
;* FFmpeg is distributed in the hope that it will be useful,
14
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
;* Lesser General Public License for more details.
17
;*
18
;* You should have received a copy of the GNU Lesser General Public
19
;* License along with FFmpeg; if not, write to the Free Software
20
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
;******************************************************************************
22

    
23
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; VP8 4-tap subpel filter coefficients as words, stored as interleaved
; tap pairs (for pmaddwd); one 64-byte group per fractional position
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap subpel filter coefficients as words, interleaved tap pairs
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

; same 4-tap coefficients as bytes, in pairs for pmaddubsw (SSSE3 paths)
fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

; 6-tap coefficients as byte pairs for pmaddubsw (SSSE3 paths)
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap coefficients, one word-broadcast row per tap (for pmullw in the
; vertical filters); two 4-tap filters (4 rows each) per position
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap coefficients, one word-broadcast row per tap
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; bilinear weights as broadcast words, one row per x = 1..7 (eighths)
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; bilinear weights as byte pairs (8-x, x) for pmaddubsw, x = 1..7
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

; In PIC builds the tables are reached through r11, which each function
; loads with lea; otherwise the tables are referenced directly by name.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif

; pshufb masks used by the SSSE3 horizontal filters
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

; fixed-point IDCT multipliers (see VP8_MULTIPLY_SUMSUB below)
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; shared constants defined in another object file
cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text
163

    
164
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------

; SSSE3 4/6-tap H and V filters using byte-pair pmaddubsw.
; %1 = block width (4 or 8); %2/%3 = xmm register counts for cglobal.
; Fix vs. original: labels now carry colons, matching the pixels functions
; below and avoiding yasm orphan-label warnings.
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]   ; rounding before the >>7
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2        ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7                       ; rounding
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                       ; rotate the 5-row window down
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
336

    
337
; 4x4 block, H-only 4-tap filter
; dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4, mx=r5.
; Fixes vs. original: label colon added; pshufw immediate written as 0x9
; for consistency with the 6-tap twin below (same value).
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
383

    
384
; 4x4 block, H-only 6-tap filter
; dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4, mx=r5.
; Fixes vs. original: label colon added; the filter-load comment wrongly
; said "4tap" for the 6-tap table.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; mm3 was clobbered; re-zero
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
440

    
441
; H-only 4-tap filter, 8 pixels wide (SSE2, word arithmetic)
; dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4, mx=r5.
; Fixes vs. original: label colon added; the header comment said
; "4x4 block" although this function produces 8 pixels per row.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7

.nextrow:
    ; first 4 output pixels
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; second 4 output pixels (same scheme, source shifted by 4)
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; ABCDEFGH (relabeled)
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; BCDEFGH
    psrldq    m3, 4         ; CDEFGH
    psrldq    m4, 6         ; DEFGH
    punpcklwd m1, m2        ; ABBCCDDE
    punpcklwd m3, m4        ; CDDEEFFG
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    packssdw  m0, m1
    paddsw    m0, [pw_64]   ; rounding
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET
493

    
494
; H-only 6-tap filter, 8 pixels wide (SSE2, word arithmetic)
; dst=r0, dststride=r1, src=r2, srcstride=r3, height=r4, mx=r5.
; Fixes vs. original: label colon added; an "ABCDEFGHI" comment on an
; 8-byte unpack corrected to "ABCDEFGH".
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    ; first 4 output pixels
    movu      m0, [r2-2]
    mova      m6, m0
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    ; second 4 output pixels (same scheme, source shifted by 4)
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; ABCDEFGH (relabeled)
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m6, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    packssdw  m0, m6
    paddsw    m0, [pw_64]   ; rounding
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET
560

    
561
; Vertical 4/6-tap filters using word multiplies (pmullw).
; %1 = cpu suffix, %2 = block width, %3 = xmm register count.
; Fix vs. original: labels now carry colons (yasm orphan-label warnings).
%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                           ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; rotate the 5-row window down
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
684

    
685
; Bilinear V/H filters using word weights (pmullw); two rows per iteration.
; %1 = cpu suffix, %2 = block width, %3 = xmm register count.
; Fix vs. original: labels now carry colons (yasm orphan-label warnings).
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d       ; r5 = weight (8-my), r6 = weight my (as offsets)
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1        ; middle row is used by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6        ; (x>>2 avg 0) completes the /8 with rounding
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d       ; r6 = weight (8-mx), r5 = weight mx (as offsets)
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
782

    
783
; Bilinear V/H filters using byte-pair weights and pmaddubsw (SSSE3).
; %1 = block width (4 or 8).
; Fix vs. original: labels now carry colons (yasm orphan-label warnings).
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16] ; (8-my, my) byte pairs
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1        ; interleave rows 0/1 for pmaddubsw
    punpcklbw m1, m2        ; interleave rows 1/2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4        ; completes the /8 with rounding
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]           ; pairs each pixel with its right neighbor
    mova      m3, [bilinear_filter_vb+r5-16] ; (8-mx, mx) byte pairs
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
861

    
862
; void put_vp8_pixels8_mmx(uint8_t *dst, int dststride,
;                          uint8_t *src, int srcstride, int height)
; Plain 8-pixel-wide copy, two rows per iteration.
cglobal put_vp8_pixels8_mmx, 5,5
.copy_two_rows:
    movq   mm4, [r2]              ; first row of the pair
    movq   mm5, [r2+r3]           ; second row of the pair
    movq  [r0], mm4
    movq  [r0+r1], mm5
    lea     r2, [r2+r3*2]         ; advance src by two rows
    lea     r0, [r0+r1*2]         ; advance dst by two rows
    sub    r4d, 2
    jg .copy_two_rows
    REP_RET
873

    
874
; void put_vp8_pixels16_mmx(uint8_t *dst, int dststride,
;                           uint8_t *src, int srcstride, int height)
; 16-pixel-wide copy, two MMX halves per row, two rows per iteration.
cglobal put_vp8_pixels16_mmx, 5,5
.copy_two_rows:
    movq   mm4, [r2]              ; row 0, left half
    movq   mm5, [r2+8]            ; row 0, right half
    movq   mm6, [r2+r3]           ; row 1, left half
    movq   mm7, [r2+r3+8]         ; row 1, right half
    movq  [r0], mm4
    movq  [r0+8], mm5
    movq  [r0+r1], mm6
    movq  [r0+r1+8], mm7
    lea     r2, [r2+r3*2]         ; advance src by two rows
    lea     r0, [r0+r1*2]         ; advance dst by two rows
    sub    r4d, 2
    jg .copy_two_rows
    REP_RET
889

    
890
; void put_vp8_pixels16_sse(uint8_t *dst, int dststride,
;                           uint8_t *src, int srcstride, int height)
; 16-pixel-wide copy: unaligned loads from src, aligned stores to dst
; (movaps requires a 16-byte-aligned dst, exactly as the original did).
cglobal put_vp8_pixels16_sse, 5,5,2
.copy_two_rows:
    movups xmm0, [r2]             ; src rows may be unaligned
    movups xmm1, [r2+r3]
    movaps [r0], xmm0             ; dst rows are 16-byte aligned
    movaps [r0+r1], xmm1
    lea     r2, [r2+r3*2]         ; advance src by two rows
    lea     r0, [r0+r1*2]         ; advance dst by two rows
    sub    r4d, 2
    jg .copy_two_rows
    REP_RET
901

    
902
;-----------------------------------------------------------------------------
903
; IDCT functions:
904
;
905
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
906
;-----------------------------------------------------------------------------
907

    
908
; void vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride)
; r0 = dst, r1 = block, r2 = stride.
; Adds the rounded DC value, (block[0] + 4) >> 3, to every pixel of a 4x4
; block. The DC is split into a positive and a negative part so unsigned
; saturating adds/subs can be used for clipping.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       mm0, [r1]

    ; calculate DC
    paddw      mm0, [pw_4]
    pxor       mm1, mm1
    psraw      mm0, 3                 ; mm0 = DC = (dc + 4) >> 3
    psubw      mm1, mm0               ; mm1 = -DC
    packuswb   mm0, mm0               ; unsigned saturate: DC if positive, else 0
    packuswb   mm1, mm1               ; -DC if DC negative, else 0
    punpcklbw  mm0, mm0               ; broadcast the byte to all 8 lanes
    punpcklbw  mm1, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1

    ; add DC
    lea         r1, [r0+r2*2]         ; r1 now points at dst row 2 (block ptr dead)
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]
    paddusb    mm2, mm0               ; add positive part, saturating at 255
    paddusb    mm3, mm0
    paddusb    mm4, mm0
    paddusb    mm5, mm0
    psubusb    mm2, mm1               ; subtract negative part, saturating at 0
    psubusb    mm3, mm1
    psubusb    mm4, mm1
    psubusb    mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
943

    
944
; void vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride)
; r0 = dst, r1 = block, r2 = stride.
; Same operation as the MMX version, but widens all four rows to words,
; adds the signed DC directly, and writes rows back with pextrd.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]        ; r1 = dst row 2 (block ptr dead)
    pxor       xmm1, xmm1

    ; calculate DC
    paddw      xmm0, [pw_4]
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3                ; DC = (dc + 4) >> 3
    pshuflw    xmm0, xmm0, 0          ; broadcast DC to the low 4 words...
    punpcklqdq xmm0, xmm0             ; ...then to all 8 words
    punpckldq  xmm2, xmm3             ; rows 0 and 1 in one register
    punpckldq  xmm4, xmm5             ; rows 2 and 3
    punpcklbw  xmm2, xmm1             ; bytes -> words
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0             ; signed add of the DC
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4             ; clip to 0..255, rows 0..3 repacked
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
971

    
972
;-----------------------------------------------------------------------------
973
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
974
;-----------------------------------------------------------------------------
975

    
976
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
;
; Fixed-point trick: mul_20091(x) = x + (x*20091 >> 16)  (pmulhw + paddw),
; and mul_35468(x) = (2*x)*17734 >> 16 — the input is doubled first because
; 35468 does not fit a signed 16-bit pmulhw operand (2*17734 == 35468).
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)        ; %3 = %1*20091 >> 16
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1                  ; %3 = mul_20091(%1)
    paddw     %4, %2                  ; %4 = mul_20091(%2)
    paddw     %1, %1                  ; double before the 17734 multiply
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)        ; %1 = mul_35468(%1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4                  ; %1 = mul_35468(%1) - mul_20091(%2)
    paddw     %2, %3                  ; %2 = mul_20091(%1) + mul_35468(%2)
%endmacro
992

    
993
; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
; One 1-D pass of the VP8 4x4 inverse transform; butterflies are done with
; SUMSUB_BA from x86util, and the two SWAPs put the outputs back into
; ascending register order.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
1006

    
1007
INIT_MMX
; void vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride)
; r0 = dst, r1 = block, r2 = stride.
; Full 4x4 inverse transform: two 1-D passes with a word transpose between
; them, then the result is added to dst (clipped) via STORE_DIFFx2 from
; x86util, which also performs the final >> 3.
cglobal vp8_idct_add_mmx, 3, 3
    ; load block data
    movq         m0, [r1]
    movq         m1, [r1+8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]       ; constants for VP8_MULTIPLY_SUMSUB
    movq         m7, [pw_17734]

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]           ; rounding bias for the final >> 3
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4               ; zero register for STORE_DIFFx2
    lea          r1, [r0+2*r2]        ; r1 = dst row 2 (block ptr dead)
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
1031

    
1032
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------

; scatter the 8 words held in mm regs %1/%2 to the DC position (word 0) of
; 8 consecutive 16-word sub-blocks of the output, starting at sub-block %3;
; clobbers r1d/r2d and shifts the contents of m%1/m%2
%macro SCATTER_WHT 3
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(0+%3)], r1w
    mov [r0+2*16*(1+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [r0+2*16*(4+%3)], r1w
    mov [r0+2*16*(5+%3)], r2w
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(8+%3)], r1w
    mov [r0+2*16*(9+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    mov [r0+2*16*(12+%3)], r1w
    mov [r0+2*16*(13+%3)], r2w
%endmacro

; 4-point 1D Hadamard transform on the four mm registers given by index;
; built from two butterfly stages, outputs permuted back via SWAP
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro

INIT_MMX
; vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16])
; In:  r0 = output luma blocks, r1 = 16 input DC words (32 bytes read)
; Inverse Walsh-Hadamard: two 1D passes with a transpose in between,
; round (+3, >>3), then scatter each word to its sub-block's DC slot.
cglobal vp8_luma_dc_wht_mmx, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]     ; rounding bias for the >>3 below
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------

; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_TRANSPOSED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea           %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1          ; move high dword down for the second movd
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular registry in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd           %5, m%3          ; stash this dword in a GPR, stored below
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d          ; flush the dword stashed above
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro

; broadcast the low byte of GPR %2 into every byte of mm/xmm register %1;
; %3 selects the cpu flavor (mmx/mmxext/sse2/ssse3), %4 (ssse3 only) is an
; all-zero pshufb selector register provided by the caller
%macro SPLATB_REG 3-4
    movd           %1, %2
%ifidn %3, ssse3
    pshufb         %1, %4           ; zero selector replicates byte 0
%else
    punpcklbw      %1, %1
%if mmsize == 16 ; sse2
    pshuflw        %1, %1, 0x0
    punpcklqdq     %1, %1
%elifidn %3, mmx
    punpcklwd      %1, %1
    punpckldq      %1, %1
%else ; mmxext
    pshufw         %1, %1, 0x0
%endif
%endif
%endmacro

; SIMPLE_LOOPFILTER cpu, direction(v/h), nregs
; instantiates vp8_<dir>_loop_filter_simple_<cpu>(uint8_t *dst, int stride, int flim)
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
    mov            r5, rsp          ; backup stack pointer
    and           rsp, ~(mmsize-1)  ; align stack
%endif
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2            ; loop counter: 2 passes of 8px each
%endif
%ifidn %1, ssse3
    pxor           m0, m0           ; zero pshufb selector for SPLATB_REG
%endif
    SPLATB_REG     m7, r2, %1, m0   ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1           ; r2 = +stride
    neg            r1               ; r1 = -stride
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
    sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4

    mova        [rsp], m0           ; store p1
    mova [rsp+mmsize], m3           ; store q1
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    mova           m0, [rsp]        ; p1
    SWAP            2, 4            ; p0
    SWAP            1, 6            ; q0
    mova           m3, [rsp+mmsize] ; q1

    TRANSPOSE4x4B  0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    add            r3, r1           ; change from r4*8*stride to r0+8*stride
    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
%else ; mmx/mmxext
    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8]    ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
%ifidn %2, v
    REP_RET
%else ; h
    mov           rsp, r5           ; restore stack pointer
    RET
%endif
%else ; sse2
%ifidn %2, h
    mov           rsp, r5           ; restore stack pointer
%endif
    RET
%endif
%endmacro

; instantiate the simple loop filter for each cpu flavor / direction;
; the third argument is the number of GPRs the variant needs
INIT_MMX
SIMPLE_LOOPFILTER mmx,    v, 4
SIMPLE_LOOPFILTER mmx,    h, 6
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2,   v, 3
SIMPLE_LOOPFILTER sse2,   h, 6
SIMPLE_LOOPFILTER ssse3,  v, 3
SIMPLE_LOOPFILTER ssse3,  h, 6

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

; INNER_LOOPFILTER cpu, direction(v/h), ngpr, width(16=luma/8=chroma), nxmm
%macro INNER_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
%endif
%define dst_reg     r0
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

%ifidn %1, ssse3
    pxor             m7, m7          ; zero pshufb selector for SPLATB_REG
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, %1, m7 ; E
    SPLATB_REG       m1, I_reg, %1, m7 ; I
    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
%ifidn %2, v
    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
%else ; h
    sub             rsp, mmsize * 5  ; extra storage space for transposes
%endif

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define p0backup m12
%define q0backup m8

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, %1, m7 ; E
    SPLATB_REG   flim_I, I_reg, %1, m7 ; I
    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2           ; luma on mmx: 2 passes of 8px
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

%ifdef m12
    SWAP              6, 12
%else
    mova             m6, mask_res
%endif
%ifidn %1, mmx
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%ifidn %1, mmx
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7          ; a
    pavgb            m1, m7          ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1          ; q1-a
    paddusb          m2, m0          ; p1+a

    ; store
%ifidn %2, v
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow [dst_reg + stride_reg  ], m5
%if mmsize == 16 && %4 == 8
    movhps [dst8_reg+mstride_reg*2], m2
    movhps [dst8_reg+mstride_reg  ], m3
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
%endif
%else ; h
    add         dst_reg, 2
    add        dst2_reg, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 2
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro

; Instantiate the inner loop filter for each instruction set / direction /
; block size.  Arguments: opt, direction (v/h), #gp-regs, width (16=luma,
; 8=chroma), #xmm-regs (presumably passed through to cglobal — confirm
; against x86inc.asm).
INIT_MMX
INNER_LOOPFILTER mmx,    v, 6, 16, 0
INNER_LOOPFILTER mmx,    h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 0

INNER_LOOPFILTER mmx,    v, 6,  8, 0
INNER_LOOPFILTER mmx,    h, 6,  8, 0
INNER_LOOPFILTER mmxext, v, 6,  8, 0
INNER_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
INNER_LOOPFILTER sse2,   v, 5, 16, 13
%ifdef m8 ; x86-64 has more registers available, so the h filter needs one fewer
INNER_LOOPFILTER sse2,   h, 5, 16, 13
%else
INNER_LOOPFILTER sse2,   h, 6, 16, 13
%endif
INNER_LOOPFILTER sse2,   v, 6,  8, 13
INNER_LOOPFILTER sse2,   h, 6,  8, 13

INNER_LOOPFILTER ssse3,  v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3,  h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3,  h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3,  v, 6,  8, 13
INNER_LOOPFILTER ssse3,  h, 6,  8, 13
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; 3 is a general-purpose register that we will clobber
; 4 is a pointer to the destination's 4th line
; 5 is -stride and 6 is +stride
%macro WRITE_8W 6
    movd             %3, %1
%if mmsize == 8
    punpckhdq        %1, %1          ; mmx: next pair of words into the low dword
%else
    psrldq           %1, 4           ; sse2: shift consumed dword out
%endif
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    movd             %3, %1
%if mmsize == 16
    psrldq           %1, 4
%endif
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    movd             %3, %2
%if mmsize == 8
    punpckhdq        %2, %2
%else
    psrldq           %2, 4
%endif
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    movd             %3, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
%if mmsize == 8
    add              %4, %5          ; mmx: rewind pointer for the caller
%endif
%endmacro
; MBEDGE_LOOPFILTER: VP8 macroblock-edge loop filter.
; %1 = instruction set suffix (mmx/mmxext/sse2/ssse3)
; %2 = direction (v = horizontal edge, h = vertical edge)
; %3 = number of general-purpose registers used (passed to cglobal)
; %4 = block width: 16 = luma (16y), 8 = chroma (8uv)
; %5 = third numeric cglobal argument (presumably XMM register count —
;      confirm against x86inc.asm)
%macro MBEDGE_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
%endif
%define dst_reg     r0
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

%ifidn %1, ssse3
    pxor             m7, m7
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, %1, m7 ; E
    SPLATB_REG       m1, I_reg, %1, m7 ; I
    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
    sub             rsp, mmsize * 8  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
                                     ;               [4]=filter tmp result
                                     ;               [5]/[6] = p2/q2 backup
                                     ;               [7]=lim_res sign result

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define lim_res  [rsp+mmsize*4]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]
%define p2backup [rsp+mmsize*5]
%define q2backup [rsp+mmsize*6]
%define lim_sign [rsp+mmsize*7]

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define lim_res  m8
%define p0backup m12
%define q0backup m8
%define p2backup m13
%define q2backup m14
%define lim_sign m15

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, %1, m7 ; E
    SPLATB_REG   flim_I, I_reg, %1, m7 ; I
    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    mova       p2backup, m1
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    mova       q2backup, m6
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1
    pand             m6, m0
%ifdef m8
    mova        lim_res, m6          ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
    pand        lim_res, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova        lim_res, m0
%endif
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
    mova             m7, [pw_63]
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, lim_res
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1         ; which are negative
    punpcklbw        m6, m0         ; signed byte->word
    punpckhbw        m1, m0
    mova       lim_sign, m0
    mova       mask_res, m6         ; backup for later in filter
    mova        lim_res, m1
    pmullw          m6, [pw_27]
    pmullw          m1, [pw_27]
    paddw           m6, m7
    paddw           m1, m7
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a0
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a0
    pandn           m0, m6          ; +a0
    psubusb         m3, m1
    paddusb         m4, m1
    paddusb         m3, m0          ; p0+a0
    psubusb         m4, m0          ; q0-a0

    mova            m6, mask_res
    mova            m1, lim_res
    mova            m0, lim_sign
    pmullw          m6, [pw_18]
    pmullw          m1, [pw_18]
    paddw           m6, m7
    paddw           m1, m7
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a1
    pandn           m0, m6          ; +a1
    psubusb         m2, m1
    paddusb         m5, m1
    paddusb         m2, m0          ; p1+a1
    psubusb         m5, m0          ; q1-a1

%ifdef m8
    SWAP             6, 12
    SWAP             1, 8
%else
    mova            m6, mask_res
    mova            m1, lim_res
%endif
    pmullw          m6, [pw_9]
    pmullw          m1, [pw_9]
    paddw           m6, m7
    paddw           m1, m7
%ifdef m15
    SWAP             7, 15
%else
    mova            m7, lim_sign
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m0, m0
    psubb           m0, m6
    pand            m0, m7          ; -a1
    pandn           m7, m6          ; +a1
%ifdef m8
    SWAP             1, 13
    SWAP             6, 14
%else
    mova            m1, p2backup
    mova            m6, q2backup
%endif
    psubusb         m1, m0
    paddusb         m6, m0
    paddusb         m1, m7          ; p1+a1
    psubusb         m6, m7          ; q1-a1

    ; store
%ifidn %2, v
    movrow [dst2_reg+mstride_reg*4], m1
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow   [dst2_reg], m5
    movrow [dst2_reg+ stride_reg  ], m6
%if mmsize == 16 && %4 == 8
    add        dst8_reg, mstride_reg
    movhps [dst8_reg+mstride_reg*2], m1
    movhps [dst8_reg+mstride_reg  ], m2
    movhps   [dst8_reg], m3
    add        dst8_reg, stride_reg
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
    movhps [dst8_reg+ stride_reg*2], m6
%endif
%else ; h
    inc         dst_reg
    inc        dst2_reg

    ; 4x8/16 transpose
    TRANSPOSE4x4B     1, 2, 3, 4, 0
    SBUTTERFLY       bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
    add         dst_reg, 4
    WRITE_8W         m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
    lea         dst_reg, [dst2_reg+mstride_reg+4]
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
    WRITE_8W         m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
    WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 5
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-5]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro
; Instantiate the macroblock-edge loop filter for each instruction set /
; direction / block size (same argument convention as the inner filter).
INIT_MMX
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0

MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 16
%ifdef m8 ; x86-64 has more registers available, so the h filter needs one fewer
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 16
%else
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse2,   v, 6,  8, 16
MBEDGE_LOOPFILTER sse2,   h, 6,  8, 16

MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 16
%ifdef m8
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 16
%else
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16
MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16