; ffmpeg / libavcodec / x86 / vp8dsp.asm @ revision 268821e7

1
;******************************************************************************
2
;* VP8 MMXEXT optimizations
3
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5
;*
6
;* This file is part of FFmpeg.
7
;*
8
;* FFmpeg is free software; you can redistribute it and/or
9
;* modify it under the terms of the GNU Lesser General Public
10
;* License as published by the Free Software Foundation; either
11
;* version 2.1 of the License, or (at your option) any later version.
12
;*
13
;* FFmpeg is distributed in the hope that it will be useful,
14
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
;* Lesser General Public License for more details.
17
;*
18
;* You should have received a copy of the GNU Lesser General Public
19
;* License along with FFmpeg; if not, write to the Free Software
20
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
;******************************************************************************
22

    
23
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; VP8 4-tap subpel filter coefficients (word form, for pmaddwd).
; Two 16-byte rows per fractional position (mx/my = 2, 4, 6); each row packs
; two taps interleaved so one pmaddwd applies a tap pair to two pixels.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; VP8 6-tap subpel filter coefficients (word form); three 16-byte rows per
; fractional position (mx/my = 1, 3, 5), tap pairs interleaved as above.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

; 4-tap coefficients in signed-byte form, tap pairs interleaved for pmaddubsw
; (SSSE3 horizontal filters); two rows per fractional position.
fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

; 6-tap coefficients in signed-byte form for pmaddubsw; pairing matches the
; filter_h6_shuf* byte interleaves used by the SSSE3 h6 filters.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap coefficients splatted one tap per 16-byte row (for pmullw in the
; vertical filters); four rows per fractional position.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap coefficients splatted one tap per 16-byte row; six rows per
; fractional position.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; Bilinear filter weights (word form); row for weight n is paired with the
; row for 8-n by the callers (weights sum to 8, shifted down by 3 via
; psraw 2 + pavgw).
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weight pairs (8-n, n) in byte form for pmaddubsw (SSSE3 path).
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

; In PIC builds the filter tables cannot be addressed absolutely; each user
; loads the table address into r11 (lea r11, [..._m]) and indexes through it.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif

; pshufb masks interleaving neighbouring source bytes into the pairings the
; pmaddubsw filters above expect.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

; IDCT multiplier constants (see VP8_MULTIPLY_SUMSUB below).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; Shared constants provided elsewhere in libavcodec's x86 constant pool.
cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------

; Emits the SSSE3 subpel filters (h6, h4, v4, v6) for one block width.
; %1 = block width (4 or 8), %2/%3 = xmm register counts passed to cglobal.
; cglobal argument registers: r0 = dst, r1 = dststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx, r6 = my.
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]                      ; 3 table rows per mx position
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]        ; PIC: table base pointer in r11
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow
    movu      m0, [r2-2]                      ; load from 2 pixels left of dst pos
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5                          ; apply the three tap pairs
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                     ; round before >>7
    psraw     m0, 7
    packuswb  m0, m0                          ; clip to u8
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4                           ; 2 table rows (32B) per mx position
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]       ; PIC: table base pointer in r11
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow
    movu      m0, [r2-1]                      ; load from 1 pixel left of dst pos
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5                          ; apply the two tap pairs
    pmaddubsw m1, m6
    paddsw    m0, m2                          ; round before >>7
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0                          ; clip to u8
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4                           ; 2 table rows (32B) per my position
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]       ; PIC: table base pointer in r11
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; rotate the 4-row window
    punpcklbw m4, m1                       ; interleave rows 0/1 for pmaddubsw
    mova      m1, m2
    punpcklbw m2, m3                       ; interleave rows 2/3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7                       ; round before >>7
    psraw     m4, 7
    packuswb  m4, m4                       ; clip to u8
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]                      ; 3 table rows per my position
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]        ; PIC: table base pointer in r11
%endif
    lea       r6, [sixtap_filter_hb+r6*8]     ; r6 now points into the table

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; rows 0/5 share a tap pair
    mova      m0, m1
    punpcklbw m1, m2                       ; rows 1/2
    mova      m7, m3
    punpcklbw m7, m4                       ; rows 3/4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                       ; rotate the 6-row window
    paddsw    m6, [pw_64]                  ; round before >>7
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                       ; clip to u8
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
%endmacro

; width 4: MMX regs (movh = movd); width 8: XMM regs (movh = movq).
INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
; 4x4 block, H-only 4-tap filter
; void put_vp8_epel4_h4_mmxext(uint8_t *dst, int dststride, uint8_t *src,
;                              int srcstride, int height, int mx, int my)
; r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx (my unused).
; Processes 4 output pixels per row as two 2-pixel pmaddwd groups.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                         ; 2 table rows (32B) per mx position
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]     ; PIC: table base pointer in r11
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]                   ; rounding constant for >>7
    pxor      mm6, mm6                       ; zero, for byte->word unpacks

.nextrow
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
; 4x4 block, H-only 6-tap filter
; void put_vp8_epel4_h6_mmxext(uint8_t *dst, int dststride, uint8_t *src,
;                              int srcstride, int height, int mx, int my)
; r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx (my unused).
; Processes 4 output pixels per row as two 2-pixel pmaddwd groups,
; three tap pairs each.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                     ; 3 table rows per mx position
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]       ; PIC: table base pointer in r11
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]                    ; rounding constant for >>7
    pxor      mm3, mm3                        ; zero, for byte->word unpacks

.nextrow
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; re-zero (mm3 was used as scratch)
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
; 8-wide block, H-only 4-tap filter
; void put_vp8_epel8_h4_sse2(uint8_t *dst, int dststride, uint8_t *src,
;                            int srcstride, int height, int mx, int my)
; r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx (my unused).
; Computes 8 output pixels per row as two 4-pixel pmaddwd halves.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                          ; 2 table rows (32B) per mx position
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]      ; PIC: table base pointer in r11
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7                         ; zero, for unpacks and clipping

.nextrow
    movh      m0, [r2-1]                     ; first 4 output pixels
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5                         ; taps F0/F1
    pmaddwd   m2, m6                         ; taps F2/F3
    paddd     m0, m2

    movh      m1, [r2+3]                     ; last 4 output pixels
    punpcklbw m1, m7        ; ABCDEFGH
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; BCDEFGH
    psrldq    m3, 4         ; CDEFGH
    psrldq    m4, 6         ; DEFGH
    punpcklwd m1, m2        ; ABBCCDDE
    punpcklwd m3, m4        ; CDDEEFFG
    pmaddwd   m1, m5                         ; taps F0/F1
    pmaddwd   m3, m6                         ; taps F2/F3
    paddd     m1, m3

    packssdw  m0, m1                         ; merge dword->word (8px)
    paddsw    m0, [pw_64]                    ; rounding
    psraw     m0, 7
    packuswb  m0, m7                         ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET
; 8-wide block, H-only 6-tap filter
; void put_vp8_epel8_h6_sse2(uint8_t *dst, int dststride, uint8_t *src,
;                            int srcstride, int height, int mx, int my)
; r0=dst, r1=dststride, r2=src, r3=srcstride, r4=height, r5=mx (my unused).
; Computes 8 output pixels per row as two 4-pixel pmaddwd halves with three
; tap pairs each.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]                  ; 3 table rows per mx position
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]    ; PIC: table base pointer in r11
%endif
    lea       r5, [sixtap_filter_hw+r5*8] ; r5 now points into the table
    pxor      m7, m7                      ; zero, for unpacks and clipping

.nextrow
    movu      m0, [r2-2]                  ; load 16 source bytes
    mova      m6, m0                      ; keep a copy for the second half
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGH (words)
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH (words)
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]                 ; taps F0/F1
    pmaddwd   m2, [r5-32]                 ; taps F2/F3
    pmaddwd   m4, [r5-16]                 ; taps F4/F5
    paddd     m0, m2
    paddd     m0, m4

    ; second half: same pattern, shifted 4 source bytes right
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; EFGHIJKL (words)
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2
    psrldq    m2, 4
    psrldq    m3, 6
    psrldq    m4, 4
    punpcklbw m4, m7
    mova      m5, m4
    psrldq    m5, 2
    punpcklwd m6, m1        ; ABBCCDDE (relative)
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m6, [r5-48]                 ; taps F0/F1
    pmaddwd   m2, [r5-32]                 ; taps F2/F3
    pmaddwd   m4, [r5-16]                 ; taps F4/F5
    paddd     m6, m2
    paddd     m6, m4

    packssdw  m0, m6                      ; merge dword->word (8px)
    paddsw    m0, [pw_64]                 ; rounding
    psraw     m0, 7
    packuswb  m0, m7                      ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET
; Emits the word-multiply vertical subpel filters (v4 and v6) for one
; instruction set. %1 = suffix (mmxext/sse2), %2 = block width (4/8),
; %3 = xmm register count for cglobal.
; cglobal argument registers: r0 = dst, r1 = dststride, r2 = src,
; r3 = srcstride, r4 = height, r5 = mx (unused), r6 = my.
%macro FILTER_V 3
; %2-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; 4 table rows (64B) per my position
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]     ; PIC: table base pointer in r11
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]                  ; rounding constant for >>7
    pxor      m7, m7                       ; zero, for unpacks and clipping
    mova      m5, [r6+48]                  ; last tap, reused every row

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                       ; rotate the 4-row window
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                           ; next row
    jg .nextrow
    REP_RET


; %2-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; my * 48 = 6 table rows (96B) / 2
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]      ; PIC: table base pointer in r11
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7                       ; zero, for unpacks and clipping

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; rotate the 6-row window
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                           ; next row
    jg .nextrow
    REP_RET
%endmacro

; width 4: MMX regs (movh = movd); width 8: XMM regs (movh = movq).
INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
; Emits the word-multiply bilinear filters (vertical and horizontal) for one
; instruction set. %1 = suffix (mmxext/sse2), %2 = block width (4/8),
; %3 = xmm register count for cglobal.
; r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, r4 = height,
; r5 = mx (h weight), r6 = my (v weight). Two rows per loop iteration;
; height is assumed even. Output = (a*(8-w) + b*w + 4) >> 3, implemented as
; psraw 2 followed by pavgw with zero (which adds 1 and shifts right once).
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                         ; 16 bytes per weight row
    sub      r5d, r6d                       ; r5 = (8-my)*16, r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]    ; PIC: table base pointer in r11
%endif
    pxor      m6, m6                        ; zero, for unpacks/rounding
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]                 ; shared by both output rows
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                        ; row 0 blend
    paddsw    m2, m3                        ; row 1 blend
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                        ; completes (x+4)>>3
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                         ; 16 bytes per weight row
    sub      r6d, r5d                       ; r6 = (8-mx)*16, r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]    ; PIC: table base pointer in r11
%endif
    pxor      m6, m6                        ; zero, for unpacks/rounding
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx
.nextrow
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]               ; 1px right of m0
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                        ; row 0 blend
    paddsw    m2, m3                        ; row 1 blend
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                        ; completes (x+4)>>3
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro

; width 4: MMX regs; width 8: XMM regs.
INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
; SSSE3 bilinear filters using byte weight pairs and pmaddubsw.
; %1 = block width (4/8). Same argument layout and rounding as
; FILTER_BILINEAR above; two rows per iteration, height assumed even.
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                         ; 16 bytes per weight-pair row
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]    ; PIC: table base pointer in r11
%endif
    pxor      m4, m4                        ; zero, for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16] ; (8-my, my) byte pairs
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]                 ; shared by both output rows
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                        ; interleave rows for pmaddubsw
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                        ; completes (x+4)>>3
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                         ; 16 bytes per weight-pair row
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]    ; PIC: table base pointer in r11
%endif
    pxor      m4, m4                        ; zero, for pavgw rounding
    mova      m2, [filter_h2_shuf]          ; pairs byte n with byte n+1
    mova      m3, [bilinear_filter_vb+r5-16] ; (8-mx, mx) byte pairs
.nextrow
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                        ; completes (x+4)>>3
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro

; width 4: MMX regs; width 8: XMM regs.
INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
; void put_vp8_pixels8_mmx(uint8_t *dst, int dststride, uint8_t *src,
;                          int srcstride, int height)
; Plain 8-wide copy (fullpel MC), two rows per iteration; height assumed even.
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET
; void put_vp8_pixels16_mmx(uint8_t *dst, int dststride, uint8_t *src,
;                           int srcstride, int height)
; Plain 16-wide copy using two 8-byte MMX loads/stores per row,
; two rows per iteration; height assumed even.
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET
; void put_vp8_pixels16_sse(uint8_t *dst, int dststride, uint8_t *src,
;                           int srcstride, int height)
; Plain 16-wide copy, two rows per iteration; height assumed even.
; Source may be unaligned (movups); dst stores use movaps, so dst is
; assumed 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    lea     r2, [r2+r3*2]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea     r0, [r0+r1*2]
    sub    r4d, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; IDCT functions:
;
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride)
; Adds the rounded DC value ((block[0]+4)>>3) to a 4x4 block of pixels.
; Signed add with unsigned clipping is done by splitting the DC into a
; positive part (mm0) and a negated part (mm1), then paddusb + psubusb.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       mm0, [r1]

    ; calculate DC
    paddw      mm0, [pw_4]      ; round
    pxor       mm1, mm1
    psraw      mm0, 3           ; dc = (block[0]+4)>>3
    psubw      mm1, mm0         ; mm1 = -dc
    packuswb   mm0, mm0         ; clamp each part to [0,255] ...
    packuswb   mm1, mm1
    punpcklbw  mm0, mm0         ; ... and broadcast to all 4 bytes
    punpcklbw  mm1, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1

    ; add DC
    lea         r1, [r0+r2*2]   ; r1 = dst + 2 rows (block ptr no longer needed)
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]
    paddusb    mm2, mm0         ; += max(dc, 0), saturating
    paddusb    mm3, mm0
    paddusb    mm4, mm0
    paddusb    mm5, mm0
    psubusb    mm2, mm1         ; -= max(-dc, 0), saturating
    psubusb    mm3, mm1
    psubusb    mm4, mm1
    psubusb    mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
; void vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride)
; Same as the MMX version, but widens all four rows to words and does one
; signed paddw, storing rows back with pextrd (SSE4.1).
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]   ; r1 = dst + 2 rows (block ptr no longer needed)
    pxor       xmm1, xmm1
    movq       xmm2, [pw_4]

    ; calculate DC
    paddw      xmm0, xmm2        ; round
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3           ; dc = (block[0]+4)>>3
    pshuflw    xmm0, xmm0, 0     ; broadcast dc to all 8 words
    punpcklqdq xmm0, xmm0
    punpckldq  xmm2, xmm3        ; rows 0/1 in one register
    punpckldq  xmm4, xmm5        ; rows 2/3
    punpcklbw  xmm2, xmm1        ; bytes -> words
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0        ; signed add of dc
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4        ; clip back to u8
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
; (pmulhw with 17734 after doubling the input gives the *35468>>16 product;
;  pmulhw with 20091 plus the original value gives the *20091>>16 product
;  including the implicit +1.0 term; %3/%4 are scratch registers)
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1             ; add input back: (x*20091>>16) + x
    paddw     %4, %2
    paddw     %1, %1             ; double so 17734 acts as 35468
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
; One 1-D pass of the VP8 4x4 inverse DCT over register rows %1-%4.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
    SWAP                 %4,  %1          ; restore row order after sumsubs
    SWAP                 %4,  %3
%endmacro
INIT_MMX
1005
cglobal vp8_idct_add_mmx, 3, 3
1006
    ; load block data
1007
    movq         m0, [r1]
1008
    movq         m1, [r1+8]
1009
    movq         m2, [r1+16]
1010
    movq         m3, [r1+24]
1011
    movq         m6, [pw_20091]
1012
    movq         m7, [pw_17734]
1013

    
1014
    ; actual IDCT
1015
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1016
    TRANSPOSE4x4W            0, 1, 2, 3, 4
1017
    paddw        m0, [pw_4]
1018
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1019
    TRANSPOSE4x4W            0, 1, 2, 3, 4
1020

    
1021
    ; store
1022
    pxor         m4, m4
1023
    lea          r1, [r0+2*r2]
1024
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
1025
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
1026

    
1027
    RET
1028

    

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------

1033
%macro SCATTER_WHT 1
1034
    pextrw r1d, m0, %1
1035
    pextrw r2d, m1, %1
1036
    mov [r0+2*16*0], r1w
1037
    mov [r0+2*16*1], r2w
1038
    pextrw r1d, m2, %1
1039
    pextrw r2d, m3, %1
1040
    mov [r0+2*16*2], r1w
1041
    mov [r0+2*16*3], r2w
1042
%endmacro
1043

    
1044
%macro HADAMARD4_1D 4
1045
    SUMSUB_BADC m%2, m%1, m%4, m%3
1046
    SUMSUB_BADC m%4, m%2, m%3, m%1
1047
    SWAP %1, %4, %3
1048
%endmacro
1049

    
1050
INIT_MMX
1051
cglobal vp8_luma_dc_wht_mmxext, 2,3
1052
    movq          m0, [r1]
1053
    movq          m1, [r1+8]
1054
    movq          m2, [r1+16]
1055
    movq          m3, [r1+24]
1056
    HADAMARD4_1D  0, 1, 2, 3
1057
    TRANSPOSE4x4W 0, 1, 2, 3, 4
1058
    paddw         m0, [pw_3]
1059
    HADAMARD4_1D  0, 1, 2, 3
1060
    psraw         m0, 3
1061
    psraw         m1, 3
1062
    psraw         m2, 3
1063
    psraw         m3, 3
1064
    SCATTER_WHT   0
1065
    add           r0, 2*16*4
1066
    SCATTER_WHT   1
1067
    add           r0, 2*16*4
1068
    SCATTER_WHT   2
1069
    add           r0, 2*16*4
1070
    SCATTER_WHT   3
1071
    RET
1072

    

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------

1077
; macro called with 7 mm register indexes as argument, and 4 regular registers
1078
;
1079
; first 4 mm registers will carry the transposed pixel data
1080
; the other three are scratchspace (one would be sufficient, but this allows
1081
; for more spreading/pipelining and thus faster execution on OOE CPUs)
1082
;
1083
; first two regular registers are buf+4*stride and buf+5*stride
1084
; third is -stride, fourth is +stride
1085
%macro READ_8x4_INTERLEAVED 11
1086
    ; interleave 8 (A-H) rows of 4 pixels each
1087
    movd          m%1, [%8+%10*4]   ; A0-3
1088
    movd          m%5, [%9+%10*4]   ; B0-3
1089
    movd          m%2, [%8+%10*2]   ; C0-3
1090
    movd          m%6, [%8+%10]     ; D0-3
1091
    movd          m%3, [%8]         ; E0-3
1092
    movd          m%7, [%9]         ; F0-3
1093
    movd          m%4, [%9+%11]     ; G0-3
1094
    punpcklbw     m%1, m%5          ; A/B interleaved
1095
    movd          m%5, [%9+%11*2]   ; H0-3
1096
    punpcklbw     m%2, m%6          ; C/D interleaved
1097
    punpcklbw     m%3, m%7          ; E/F interleaved
1098
    punpcklbw     m%4, m%5          ; G/H interleaved
1099
%endmacro
1100

    
1101
; macro called with 7 mm register indexes as argument, and 5 regular registers
1102
; first 11 mean the same as READ_8x4_TRANSPOSED above
1103
; fifth regular register is scratchspace to reach the bottom 8 rows, it
1104
; will be set to second regular register + 8*stride at the end
1105
%macro READ_16x4_INTERLEAVED 12
1106
    ; transpose 16 (A-P) rows of 4 pixels each
1107
    lea           %12, [r0+8*r2]
1108

    
1109
    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
1110
    movd          m%1, [%8+%10*4]   ; A0-3
1111
    movd          m%3, [%12+%10*4]  ; I0-3
1112
    movd          m%2, [%8+%10*2]   ; C0-3
1113
    movd          m%4, [%12+%10*2]  ; K0-3
1114
    movd          m%6, [%8+%10]     ; D0-3
1115
    movd          m%5, [%12+%10]    ; L0-3
1116
    movd          m%7, [%12]        ; M0-3
1117
    add           %12, %11
1118
    punpcklbw     m%1, m%3          ; A/I
1119
    movd          m%3, [%8]         ; E0-3
1120
    punpcklbw     m%2, m%4          ; C/K
1121
    punpcklbw     m%6, m%5          ; D/L
1122
    punpcklbw     m%3, m%7          ; E/M
1123
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved
1124

    
1125
    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
1126
    movd         m%5, [%9+%10*4]   ; B0-3
1127
    movd         m%4, [%12+%10*4]  ; J0-3
1128
    movd         m%7, [%9]         ; F0-3
1129
    movd         m%6, [%12]        ; N0-3
1130
    punpcklbw    m%5, m%4          ; B/J
1131
    punpcklbw    m%7, m%6          ; F/N
1132
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
1133
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
1134
    movd         m%4, [%9+%11]     ; G0-3
1135
    movd         m%6, [%12+%11]    ; O0-3
1136
    movd         m%5, [%9+%11*2]   ; H0-3
1137
    movd         m%7, [%12+%11*2]  ; P0-3
1138
    punpcklbw    m%4, m%6          ; G/O
1139
    punpcklbw    m%5, m%7          ; H/P
1140
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
1141
%endmacro
1142

    
1143
; write 4 mm registers of 2 dwords each
1144
; first four arguments are mm register indexes containing source data
1145
; last four are registers containing buf+4*stride, buf+5*stride,
1146
; -stride and +stride
1147
%macro WRITE_4x2D 8
1148
    ; write out (2 dwords per register)
1149
    movd    [%5+%7*4], m%1
1150
    movd    [%5+%7*2], m%2
1151
    movd         [%5], m%3
1152
    movd      [%6+%8], m%4
1153
    punpckhdq     m%1, m%1
1154
    punpckhdq     m%2, m%2
1155
    punpckhdq     m%3, m%3
1156
    punpckhdq     m%4, m%4
1157
    movd    [%6+%7*4], m%1
1158
    movd      [%5+%7], m%2
1159
    movd         [%6], m%3
1160
    movd    [%6+%8*2], m%4
1161
%endmacro
1162

    
1163
; write 4 xmm registers of 4 dwords each
1164
; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
1165
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
1166
; we add 1*stride to the third regular registry in the process
1167
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
1168
; same memory region), or 8 if they cover two separate buffers (third one points to
1169
; a different memory region than the first two), allowing for more optimal code for
1170
; the 16-width case
1171
%macro WRITE_4x4D 10
1172
    ; write out (4 dwords per register), start with dwords zero
1173
    movd    [%5+%8*4], m%1
1174
    movd         [%5], m%2
1175
    movd    [%7+%8*4], m%3
1176
    movd         [%7], m%4
1177

    
1178
    ; store dwords 1
1179
    psrldq        m%1, 4
1180
    psrldq        m%2, 4
1181
    psrldq        m%3, 4
1182
    psrldq        m%4, 4
1183
    movd    [%6+%8*4], m%1
1184
    movd         [%6], m%2
1185
%if %10 == 16
1186
    movd    [%6+%9*4], m%3
1187
%endif
1188
    movd      [%7+%9], m%4
1189

    
1190
    ; write dwords 2
1191
    psrldq        m%1, 4
1192
    psrldq        m%2, 4
1193
%if %10 == 8
1194
    movd    [%5+%8*2], m%1
1195
    movd           %5, m%3
1196
%endif
1197
    psrldq        m%3, 4
1198
    psrldq        m%4, 4
1199
%if %10 == 16
1200
    movd    [%5+%8*2], m%1
1201
%endif
1202
    movd      [%6+%9], m%2
1203
    movd    [%7+%8*2], m%3
1204
    movd    [%7+%9*2], m%4
1205
    add            %7, %9
1206

    
1207
    ; store dwords 3
1208
    psrldq        m%1, 4
1209
    psrldq        m%2, 4
1210
    psrldq        m%3, 4
1211
    psrldq        m%4, 4
1212
%if %10 == 8
1213
    mov     [%7+%8*4], %5d
1214
    movd    [%6+%8*2], m%1
1215
%else
1216
    movd      [%5+%8], m%1
1217
%endif
1218
    movd    [%6+%9*2], m%2
1219
    movd    [%7+%8*2], m%3
1220
    movd    [%7+%9*2], m%4
1221
%endmacro
1222

    
1223
%macro SPLATB_REG 3
1224
    movd           %1, %2
1225
    punpcklbw      %1, %1
1226
%if mmsize == 16 ; sse2
1227
    punpcklwd      %1, %1
1228
    pshufd         %1, %1, 0x0
1229
%elifidn %3, mmx
1230
    punpcklwd      %1, %1
1231
    punpckldq      %1, %1
1232
%else ; mmxext
1233
    pshufw         %1, %1, 0x0
1234
%endif
1235
%endmacro
1236

    
1237
%macro SIMPLE_LOOPFILTER 3
1238
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
1239
%ifidn %2, h
1240
    mov            r5, rsp          ; backup stack pointer
1241
    and           rsp, ~(mmsize-1)  ; align stack
1242
%endif
1243
%if mmsize == 8 ; mmx/mmxext
1244
    mov            r3, 2
1245
%endif
1246
    SPLATB_REG     m7, r2, %1       ; splat "flim" into register
1247

    
1248
    ; set up indexes to address 4 rows
1249
    mov            r2, r1
1250
    neg            r1
1251
%ifidn %2, h
1252
    lea            r0, [r0+4*r2-2]
1253
    sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
1254
%endif
1255

    
1256
%if mmsize == 8 ; mmx / mmxext
1257
.next8px
1258
%endif
1259
%ifidn %2, v
1260
    ; read 4 half/full rows of pixels
1261
    mova           m0, [r0+r1*2]    ; p1
1262
    mova           m1, [r0+r1]      ; p0
1263
    mova           m2, [r0]         ; q0
1264
    mova           m3, [r0+r2]      ; q1
1265
%else ; h
1266
    lea            r4, [r0+r2]
1267

    
1268
%if mmsize == 8 ; mmx/mmxext
1269
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
1270
%else ; sse2
1271
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
1272
%endif
1273
    TRANSPOSE4x4W         0, 1, 2, 3, 4
1274

    
1275
    mova        [rsp], m0           ; store p1
1276
    mova [rsp+mmsize], m3           ; store q1
1277
%endif
1278

    
1279
    ; simple_limit
1280
    mova           m5, m2           ; m5=backup of q0
1281
    mova           m6, m1           ; m6=backup of p0
1282
    psubusb        m1, m2           ; p0-q0
1283
    psubusb        m2, m6           ; q0-p0
1284
    por            m1, m2           ; FFABS(p0-q0)
1285
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2
1286

    
1287
    mova           m4, m3
1288
    mova           m2, m0
1289
    psubusb        m3, m0           ; q1-p1
1290
    psubusb        m0, m4           ; p1-q1
1291
    por            m3, m0           ; FFABS(p1-q1)
1292
    mova           m0, [pb_80]
1293
    pxor           m2, m0
1294
    pxor           m4, m0
1295
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
1296
    pand           m3, [pb_FE]
1297
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
1298
    paddusb        m3, m1
1299
    psubusb        m3, m7
1300
    pxor           m1, m1
1301
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
1302

    
1303
    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
1304
    mova           m4, m5
1305
    pxor           m5, m0
1306
    pxor           m0, m6
1307
    psubsb         m5, m0           ; q0-p0 (signed)
1308
    paddsb         m2, m5
1309
    paddsb         m2, m5
1310
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
1311
    pand           m2, m3           ; apply filter mask (m3)
1312

    
1313
    mova           m3, [pb_F8]
1314
    mova           m1, m2
1315
    paddsb         m2, [pb_4]       ; f1<<3=a+4
1316
    paddsb         m1, [pb_3]       ; f2<<3=a+3
1317
    pand           m2, m3
1318
    pand           m1, m3           ; cache f2<<3
1319

    
1320
    pxor           m0, m0
1321
    pxor           m3, m3
1322
    pcmpgtb        m0, m2           ; which values are <0?
1323
    psubb          m3, m2           ; -f1<<3
1324
    psrlq          m2, 3            ; +f1
1325
    psrlq          m3, 3            ; -f1
1326
    pand           m3, m0
1327
    pandn          m0, m2
1328
    psubusb        m4, m0
1329
    paddusb        m4, m3           ; q0-f1
1330

    
1331
    pxor           m0, m0
1332
    pxor           m3, m3
1333
    pcmpgtb        m0, m1           ; which values are <0?
1334
    psubb          m3, m1           ; -f2<<3
1335
    psrlq          m1, 3            ; +f2
1336
    psrlq          m3, 3            ; -f2
1337
    pand           m3, m0
1338
    pandn          m0, m1
1339
    paddusb        m6, m0
1340
    psubusb        m6, m3           ; p0+f2
1341

    
1342
    ; store
1343
%ifidn %2, v
1344
    mova         [r0], m4
1345
    mova      [r0+r1], m6
1346
%else ; h
1347
    mova           m0, [rsp]        ; p1
1348
    SWAP            2, 4            ; p0
1349
    SWAP            1, 6            ; q0
1350
    mova           m3, [rsp+mmsize] ; q1
1351

    
1352
    TRANSPOSE4x4B  0, 1, 2, 3, 4
1353
%if mmsize == 16 ; sse2
1354
    add            r3, r1           ; change from r4*8*stride to r0+8*stride
1355
    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
1356
%else ; mmx/mmxext
1357
    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
1358
%endif
1359
%endif
1360

    
1361
%if mmsize == 8 ; mmx/mmxext
1362
    ; next 8 pixels
1363
%ifidn %2, v
1364
    add            r0, 8            ; advance 8 cols = pixels
1365
%else ; h
1366
    lea            r0, [r0+r2*8]    ; advance 8 rows = lines
1367
%endif
1368
    dec            r3
1369
    jg .next8px
1370
%ifidn %2, v
1371
    REP_RET
1372
%else ; h
1373
    mov           rsp, r5           ; restore stack pointer
1374
    RET
1375
%endif
1376
%else ; sse2
1377
%ifidn %2, h
1378
    mov           rsp, r5           ; restore stack pointer
1379
%endif
1380
    RET
1381
%endif
1382
%endmacro
1383

    
1384
INIT_MMX
1385
SIMPLE_LOOPFILTER mmx,    v, 4
1386
SIMPLE_LOOPFILTER mmx,    h, 6
1387
SIMPLE_LOOPFILTER mmxext, v, 4
1388
SIMPLE_LOOPFILTER mmxext, h, 6
1389
INIT_XMM
1390
SIMPLE_LOOPFILTER sse2,   v, 3
1391
SIMPLE_LOOPFILTER sse2,   h, 6
1392

    

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

1398
%macro INNER_LOOPFILTER 5
1399
%if %4 == 8 ; chroma
1400
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
1401
%define dst8_reg    r1
1402
%define mstride_reg r2
1403
%define E_reg       r3
1404
%define I_reg       r4
1405
%define hev_thr_reg r5
1406
%else ; luma
1407
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1408
%define mstride_reg r1
1409
%define E_reg       r2
1410
%define I_reg       r3
1411
%define hev_thr_reg r4
1412
%ifdef m8 ; x86-64, sse2
1413
%define dst8_reg    r4
1414
%elif mmsize == 16 ; x86-32, sse2
1415
%define dst8_reg    r5
1416
%else ; x86-32, mmx/mmxext
1417
%define cnt_reg     r5
1418
%endif
1419
%endif
1420
%define dst_reg     r0
1421
%define stride_reg  E_reg
1422
%define dst2_reg    I_reg
1423
%ifndef m8
1424
%define stack_reg   hev_thr_reg
1425
%endif
1426

    
1427
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
1428
    ; splat function arguments
1429
    SPLATB_REG       m0, E_reg, %1   ; E
1430
    SPLATB_REG       m1, I_reg, %1   ; I
1431
    SPLATB_REG       m2, hev_thr_reg, %1 ; hev_thresh
1432

    
1433
    ; align stack
1434
    mov       stack_reg, rsp         ; backup stack pointer
1435
    and             rsp, ~(mmsize-1) ; align stack
1436
%ifidn %2, v
1437
    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1438
                                     ;               [3]=hev() result
1439
%else ; h
1440
    sub             rsp, mmsize * 5  ; extra storage space for transposes
1441
%endif
1442

    
1443
%define flim_E   [rsp]
1444
%define flim_I   [rsp+mmsize]
1445
%define hev_thr  [rsp+mmsize*2]
1446
%define mask_res [rsp+mmsize*3]
1447
%define p0backup [rsp+mmsize*3]
1448
%define q0backup [rsp+mmsize*4]
1449

    
1450
    mova         flim_E, m0
1451
    mova         flim_I, m1
1452
    mova        hev_thr, m2
1453

    
1454
%else ; sse2 on x86-64
1455

    
1456
%define flim_E   m9
1457
%define flim_I   m10
1458
%define hev_thr  m11
1459
%define mask_res m12
1460
%define p0backup m12
1461
%define q0backup m8
1462

    
1463
    ; splat function arguments
1464
    SPLATB_REG   flim_E, E_reg, %1   ; E
1465
    SPLATB_REG   flim_I, I_reg, %1   ; I
1466
    SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh
1467
%endif
1468

    
1469
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
1470
    mov         cnt_reg, 2
1471
%endif
1472
    mov      stride_reg, mstride_reg
1473
    neg     mstride_reg
1474
%ifidn %2, h
1475
    lea         dst_reg, [dst_reg + stride_reg*4-4]
1476
%if %4 == 8
1477
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
1478
%endif
1479
%endif
1480

    
1481
%if mmsize == 8
1482
.next8px
1483
%endif
1484
    ; read
1485
    lea        dst2_reg, [dst_reg + stride_reg]
1486
%ifidn %2, v
1487
%if %4 == 8 && mmsize == 16
1488
%define movrow movh
1489
%else
1490
%define movrow mova
1491
%endif
1492
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
1493
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
1494
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
1495
    movrow           m5, [dst2_reg]               ; q1
1496
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
1497
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
1498
%if mmsize == 16 && %4 == 8
1499
    movhps           m0, [dst8_reg+mstride_reg*4]
1500
    movhps           m2, [dst8_reg+mstride_reg*2]
1501
    add        dst8_reg, stride_reg
1502
    movhps           m1, [dst8_reg+mstride_reg*4]
1503
    movhps           m5, [dst8_reg]
1504
    movhps           m6, [dst8_reg+ stride_reg]
1505
    movhps           m7, [dst8_reg+ stride_reg*2]
1506
    add        dst8_reg, mstride_reg
1507
%endif
1508
%elif mmsize == 8 ; mmx/mmxext (h)
1509
    ; read 8 rows of 8px each
1510
    movu             m0, [dst_reg +mstride_reg*4]
1511
    movu             m1, [dst2_reg+mstride_reg*4]
1512
    movu             m2, [dst_reg +mstride_reg*2]
1513
    movu             m3, [dst_reg +mstride_reg]
1514
    movu             m4, [dst_reg]
1515
    movu             m5, [dst2_reg]
1516
    movu             m6, [dst2_reg+ stride_reg]
1517

    
1518
    ; 8x8 transpose
1519
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1520
    mova       q0backup, m1
1521
    movu             m7, [dst2_reg+ stride_reg*2]
1522
    TRANSPOSE4x4B     4, 5, 6, 7, 1
1523
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
1524
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
1525
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1526
    mova             m1, q0backup
1527
    mova       q0backup, m2          ; store q0
1528
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1529
    mova       p0backup, m5          ; store p0
1530
    SWAP              1, 4
1531
    SWAP              2, 4
1532
    SWAP              6, 3
1533
    SWAP              5, 3
1534
%else ; sse2 (h)
1535
%if %4 == 16
1536
    lea        dst8_reg, [dst_reg + stride_reg*8]
1537
%endif
1538

    
1539
    ; read 16 rows of 8px each, interleave
1540
    movh             m0, [dst_reg +mstride_reg*4]
1541
    movh             m1, [dst8_reg+mstride_reg*4]
1542
    movh             m2, [dst_reg +mstride_reg*2]
1543
    movh             m5, [dst8_reg+mstride_reg*2]
1544
    movh             m3, [dst_reg +mstride_reg]
1545
    movh             m6, [dst8_reg+mstride_reg]
1546
    movh             m4, [dst_reg]
1547
    movh             m7, [dst8_reg]
1548
    punpcklbw        m0, m1          ; A/I
1549
    punpcklbw        m2, m5          ; C/K
1550
    punpcklbw        m3, m6          ; D/L
1551
    punpcklbw        m4, m7          ; E/M
1552

    
1553
    add        dst8_reg, stride_reg
1554
    movh             m1, [dst2_reg+mstride_reg*4]
1555
    movh             m6, [dst8_reg+mstride_reg*4]
1556
    movh             m5, [dst2_reg]
1557
    movh             m7, [dst8_reg]
1558
    punpcklbw        m1, m6          ; B/J
1559
    punpcklbw        m5, m7          ; F/N
1560
    movh             m6, [dst2_reg+ stride_reg]
1561
    movh             m7, [dst8_reg+ stride_reg]
1562
    punpcklbw        m6, m7          ; G/O
1563

    
1564
    ; 8x16 transpose
1565
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1566
%ifdef m8
1567
    SWAP              1, 8
1568
%else
1569
    mova       q0backup, m1
1570
%endif
1571
    movh             m7, [dst2_reg+ stride_reg*2]
1572
    movh             m1, [dst8_reg+ stride_reg*2]
1573
    punpcklbw        m7, m1          ; H/P
1574
    TRANSPOSE4x4B     4, 5, 6, 7, 1
1575
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
1576
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
1577
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1578
%ifdef m8
1579
    SWAP              1, 8
1580
    SWAP              2, 8
1581
%else
1582
    mova             m1, q0backup
1583
    mova       q0backup, m2          ; store q0
1584
%endif
1585
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1586
%ifdef m12
1587
    SWAP              5, 12
1588
%else
1589
    mova       p0backup, m5          ; store p0
1590
%endif
1591
    SWAP              1, 4
1592
    SWAP              2, 4
1593
    SWAP              6, 3
1594
    SWAP              5, 3
1595
%endif
1596

    
1597
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1598
    mova             m4, m1
1599
    SWAP              4, 1
1600
    psubusb          m4, m0          ; p2-p3
1601
    psubusb          m0, m1          ; p3-p2
1602
    por              m0, m4          ; abs(p3-p2)
1603

    
1604
    mova             m4, m2
1605
    SWAP              4, 2
1606
    psubusb          m4, m1          ; p1-p2
1607
    psubusb          m1, m2          ; p2-p1
1608
    por              m1, m4          ; abs(p2-p1)
1609

    
1610
    mova             m4, m6
1611
    SWAP              4, 6
1612
    psubusb          m4, m7          ; q2-q3
1613
    psubusb          m7, m6          ; q3-q2
1614
    por              m7, m4          ; abs(q3-q2)
1615

    
1616
    mova             m4, m5
1617
    SWAP              4, 5
1618
    psubusb          m4, m6          ; q1-q2
1619
    psubusb          m6, m5          ; q2-q1
1620
    por              m6, m4          ; abs(q2-q1)
1621

    
1622
%ifidn %1, mmx
1623
    mova             m4, flim_I
1624
    pxor             m3, m3
1625
    psubusb          m0, m4
1626
    psubusb          m1, m4
1627
    psubusb          m7, m4
1628
    psubusb          m6, m4
1629
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
1630
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
1631
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
1632
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
1633
    pand             m0, m1
1634
    pand             m7, m6
1635
    pand             m0, m7
1636
%else ; mmxext/sse2
1637
    pmaxub           m0, m1
1638
    pmaxub           m6, m7
1639
    pmaxub           m0, m6
1640
%endif
1641

    
1642
    ; normal_limit and high_edge_variance for p1-p0, q1-q0
1643
    SWAP              7, 3           ; now m7 is zero
1644
%ifidn %2, v
1645
    movrow           m3, [dst_reg +mstride_reg] ; p0
1646
%if mmsize == 16 && %4 == 8
1647
    movhps           m3, [dst8_reg+mstride_reg]
1648
%endif
1649
%elifdef m12
1650
    SWAP              3, 12
1651
%else
1652
    mova             m3, p0backup
1653
%endif
1654

    
1655
    mova             m1, m2
1656
    SWAP              1, 2
1657
    mova             m6, m3
1658
    SWAP              3, 6
1659
    psubusb          m1, m3          ; p1-p0
1660
    psubusb          m6, m2          ; p0-p1
1661
    por              m1, m6          ; abs(p1-p0)
1662
%ifidn %1, mmx
1663
    mova             m6, m1
1664
    psubusb          m1, m4
1665
    psubusb          m6, hev_thr
1666
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
1667
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
1668
    pand             m0, m1
1669
    mova       mask_res, m6
1670
%else ; mmxext/sse2
1671
    pmaxub           m0, m1          ; max_I
1672
    SWAP              1, 4           ; max_hev_thresh
1673
%endif
1674

    
1675
    SWAP              6, 4           ; now m6 is I
1676
%ifidn %2, v
1677
    movrow           m4, [dst_reg]   ; q0
1678
%if mmsize == 16 && %4 == 8
1679
    movhps           m4, [dst8_reg]
1680
%endif
1681
%elifdef m8
1682
    SWAP              4, 8
1683
%else
1684
    mova             m4, q0backup
1685
%endif
1686
    mova             m1, m4
1687
    SWAP              1, 4
1688
    mova             m7, m5
1689
    SWAP              7, 5
1690
    psubusb          m1, m5          ; q0-q1
1691
    psubusb          m7, m4          ; q1-q0
1692
    por              m1, m7          ; abs(q1-q0)
1693
%ifidn %1, mmx
1694
    mova             m7, m1
1695
    psubusb          m1, m6
1696
    psubusb          m7, hev_thr
1697
    pxor             m6, m6
1698
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
1699
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
1700
    mova             m6, mask_res
1701
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
1702
    pand             m6, m7
1703
%else ; mmxext/sse2
1704
    pxor             m7, m7
1705
    pmaxub           m0, m1
1706
    pmaxub           m6, m1
1707
    psubusb          m0, flim_I
1708
    psubusb          m6, hev_thr
1709
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
1710
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
1711
%endif
1712
%ifdef m12
1713
    SWAP              6, 12
1714
%else
1715
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1716
%endif
1717

    
1718
    ; simple_limit
1719
    mova             m1, m3
1720
    SWAP              1, 3
1721
    mova             m6, m4          ; keep copies of p0/q0 around for later use
1722
    SWAP              6, 4
1723
    psubusb          m1, m4          ; p0-q0
1724
    psubusb          m6, m3          ; q0-p0
1725
    por              m1, m6          ; abs(q0-p0)
1726
    paddusb          m1, m1          ; m1=2*abs(q0-p0)
1727

    
1728
    mova             m7, m2
1729
    SWAP              7, 2
1730
    mova             m6, m5
1731
    SWAP              6, 5
1732
    psubusb          m7, m5          ; p1-q1
1733
    psubusb          m6, m2          ; q1-p1
1734
    por              m7, m6          ; abs(q1-p1)
1735
    pxor             m6, m6
1736
    pand             m7, [pb_FE]
1737
    psrlq            m7, 1           ; abs(q1-p1)/2
1738
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
1739
    psubusb          m7, flim_E
1740
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1741
    pand             m0, m7          ; normal_limit result
1742

    
1743
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1744
%ifdef m8 ; x86-64 && sse2
1745
    mova             m8, [pb_80]
1746
%define pb_80_var m8
1747
%else ; x86-32 or mmx/mmxext
1748
%define pb_80_var [pb_80]
1749
%endif
1750
    mova             m1, m4
1751
    mova             m7, m3
1752
    pxor             m1, pb_80_var
1753
    pxor             m7, pb_80_var
1754
    psubsb           m1, m7          ; (signed) q0-p0
1755
    mova             m6, m2
1756
    mova             m7, m5
1757
    pxor             m6, pb_80_var
1758
    pxor             m7, pb_80_var
1759
    psubsb           m6, m7          ; (signed) p1-q1
1760
    mova             m7, mask_res
1761
    pandn            m7, m6
1762
    paddsb           m7, m1
1763
    paddsb           m7, m1
1764
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)
1765

    
1766
    pand             m7, m0
1767
    mova             m1, [pb_F8]
1768
    mova             m6, m7
1769
    paddsb           m7, [pb_3]
1770
    paddsb           m6, [pb_4]
1771
    pand             m7, m1
1772
    pand             m6, m1
1773

    
1774
    pxor             m1, m1
1775
    pxor             m0, m0
1776
    pcmpgtb          m1, m7
1777
    psubb            m0, m7
1778
    psrlq            m7, 3           ; +f2
1779
    psrlq            m0, 3           ; -f2
1780
    pand             m0, m1
1781
    pandn            m1, m7
1782
    psubusb          m3, m0
1783
    paddusb          m3, m1          ; p0+f2
1784

    
1785
    pxor             m1, m1
1786
    pxor             m0, m0
1787
    pcmpgtb          m0, m6
1788
    psubb            m1, m6
1789
    psrlq            m6, 3           ; +f1
1790
    psrlq            m1, 3           ; -f1
1791
    pand             m1, m0
1792
    pandn            m0, m6
1793
    psubusb          m4, m0
1794
    paddusb          m4, m1          ; q0-f1
1795

    
1796
%ifdef m12
1797
    SWAP              6, 12
1798
%else
1799
    mova             m6, mask_res
1800
%endif
1801
%ifidn %1, mmx
1802
    mova             m7, [pb_1]
1803
%else ; mmxext/sse2
1804
    pxor             m7, m7
1805
%endif
1806
    pand             m0, m6
1807
    pand             m1, m6
1808
%ifidn %1, mmx
1809
    paddusb          m0, m7
1810
    pand             m1, [pb_FE]
1811
    pandn            m7, m0
1812
    psrlq            m1, 1
1813
    psrlq            m7, 1
1814
    SWAP              0, 7
1815
%else ; mmxext/sse2
1816
    psubusb          m1, [pb_1]
1817
    pavgb            m0, m7          ; a
1818
    pavgb            m1, m7          ; -a
1819
%endif
1820
    psubusb          m5, m0
1821
    psubusb          m2, m1
1822
    paddusb          m5, m1          ; q1-a
1823
    paddusb          m2, m0          ; p1+a
1824

    
1825
    ; store
1826
%ifidn %2, v
1827
    movrow [dst_reg +mstride_reg*2], m2
1828
    movrow [dst_reg +mstride_reg  ], m3
1829
    movrow    [dst_reg], m4
1830
    movrow [dst_reg + stride_reg  ], m5
1831
%if mmsize == 16 && %4 == 8
1832
    movhps [dst8_reg+mstride_reg*2], m2
1833
    movhps [dst8_reg+mstride_reg  ], m3
1834
    movhps   [dst8_reg], m4
1835
    movhps [dst8_reg+ stride_reg  ], m5
1836
%endif
1837
%else ; h
1838
    add         dst_reg, 2
1839
    add        dst2_reg, 2
1840

    
1841
    ; 4x8/16 transpose
1842
    TRANSPOSE4x4B     2, 3, 4, 5, 6
1843

    
1844
%if mmsize == 8 ; mmx/mmxext (h)
1845
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
1846
%else ; sse2 (h)
1847
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
1848
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
1849
%endif
1850
%endif
1851

    
1852
%if mmsize == 8
1853
%if %4 == 8 ; chroma
1854
%ifidn %2, h
1855
    sub         dst_reg, 2
1856
%endif
1857
    cmp         dst_reg, dst8_reg
1858
    mov         dst_reg, dst8_reg
1859
    jnz .next8px
1860
%else
1861
%ifidn %2, h
1862
    lea         dst_reg, [dst_reg + stride_reg*8-2]
1863
%else ; v
1864
    add         dst_reg, 8
1865
%endif
1866
    dec         cnt_reg
1867
    jg .next8px
1868
%endif
1869
%endif
1870

    
1871
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
1872
    mov             rsp, stack_reg   ; restore stack pointer
1873
%endif
1874
    RET
1875
%endmacro
1876

    
1877
INIT_MMX
1878
INNER_LOOPFILTER mmx,    v, 6, 16, 8
1879
INNER_LOOPFILTER mmx,    h, 6, 16, 8
1880
INNER_LOOPFILTER mmxext, v, 6, 16, 8
1881
INNER_LOOPFILTER mmxext, h, 6, 16, 8
1882

    
1883
INNER_LOOPFILTER mmx,    v, 6,  8, 8
1884
INNER_LOOPFILTER mmx,    h, 6,  8, 8
1885
INNER_LOOPFILTER mmxext, v, 6,  8, 8
1886
INNER_LOOPFILTER mmxext, h, 6,  8, 8
1887

    
1888
INIT_XMM
1889
INNER_LOOPFILTER sse2,   v, 5, 16, 13
1890
%ifdef m8
1891
INNER_LOOPFILTER sse2,   h, 5, 16, 13
1892
%else
1893
INNER_LOOPFILTER sse2,   h, 6, 16, 13
1894
%endif
1895
INNER_LOOPFILTER sse2,   v, 6,  8, 13
1896
INNER_LOOPFILTER sse2,   h, 6,  8, 13