;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
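
; The 4-tap and 6-tap coefficients above are VP8's bicubic subpel filters;
; each row sums to 128, i.e. the filters are normalized to 7 bits. The taps
; are stored pairwise interleaved to match the operand order of pmaddwd
; (_hw tables, words) and pmaddubsw (_hb tables, bytes); the _v tables keep
; one splatted tap per row for the pmullw-based vertical filters.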

%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif
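
; On PIC builds the tables above can't be addressed as absolute constants;
; each function therefore loads the table base into r11 with lea (see the
; %ifdef PIC blocks below), and these aliases make the same indexing
; expressions work for both cases.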

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------
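
; As a rough scalar sketch (illustrative only, not the FFmpeg C code), the
; h6 kernels below compute, with F[] being one row of the 6-tap tables:
;
;     for (x = 0; x < width; x++)
;         dst[x] = av_clip_uint8((F[0] * src[x - 2] + F[1] * src[x - 1] +
;                                 F[2] * src[x + 0] + F[3] * src[x + 1] +
;                                 F[4] * src[x + 2] + F[5] * src[x + 3] +
;                                 64) >> 7);
;
; The 4-tap variants drop the two outermost taps, and the vertical variants
; step through src by srcstride instead of by one byte.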

%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
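
; Each instantiation expands the macro once for MMX registers (4px wide) and
; once for XMM registers (8px wide); the trailing macro arguments are the
; xmm register counts passed to cglobal, which Win64 builds use to know how
; many xmm registers to save and restore.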

; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4                          ; next row
    jg .nextrow
    REP_RET

; 8x8 block, H-only 4-tap filter
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7

.nextrow:
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    movh      m1, [r2+3]
    punpcklbw m1, m7        ; ABCDEFGH
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; BCDEFGH
    psrldq    m3, 4         ; CDEFGH
    psrldq    m4, 6         ; DEFGH
    punpcklwd m1, m2        ; ABBCCDDE
    punpcklwd m3, m4        ; CDDEEFFG
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    movu      m0, [r2-2]
    mova      m6, m0
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGHI
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; ABCDEFGHI
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m6, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4            ; next row
    jg .nextrow
    REP_RET

%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                           ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8

%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro
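
; Rounding note: the bilinear weights sum to 8, so the exact result is
; (sum + 4) >> 3. "psraw 2" followed by pavgw against a zero register
; computes ((sum >> 2) + 1) >> 1, which is identical for all non-negative
; sums, saving a separate rounding-constant add.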

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7

%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8

cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    lea     r2, [r2+r3*2]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea     r0, [r0+r1*2]
    sub    r4d, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
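
; Scalar equivalent (illustrative): round the DC, clear it from the block,
; and add it to each pixel of the 4x4 destination:
;
;     int dc = (block[0] + 4) >> 3;
;     block[0] = 0;
;     for (y = 0; y < 4; y++)
;         for (x = 0; x < 4; x++)
;             dst[y * stride + x] = av_clip_uint8(dst[y * stride + x] + dc);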

%macro ADD_DC 4
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4    [r0+%3], m2
    %4 [r0+r2+%3], m3
    %4    [r1+%3], m4
    %4 [r1+r2+%3], m5
%endmacro
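
; ADD_DC applies a signed DC with unsigned saturating ops: callers pass
; %1 = packuswb(dc) (the DC if positive, else 0) and %2 = packuswb(-dc)
; (the magnitude if negative, else 0), so the paddusb/psubusb pair adds
; +dc or -dc with correct clipping to [0,255].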

INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       m0, [r1]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd      [r1], m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    lea        r1, [r0+r2*2]
    ADD_DC     m0, m1, 0, movh
    RET

INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       m0, [r1]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd     [r1], m1
    lea        r1, [r0+r2*2]
    movd       m2, [r0]
    movd       m3, [r0+r2]
    movd       m4, [r1]
    movd       m5, [r1+r2]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd      [r0], m2
    pextrd [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd [r1+r2], m2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET

INIT_XMM
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m1
    movd [r1+32*1], m1
    movd [r1+32*2], m1
    movd [r1+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    lea       r0, [r0+r2*4]
    lea       r1, [r1+r2*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
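
; 20091 and 35468 are the VP8 transform constants: x + (x*20091 >> 16)
; approximates x*sqrt(2)*cos(pi/8) and (x*35468 >> 16) approximates
; x*sqrt(2)*sin(pi/8). Since 35468 doesn't fit in a signed word, the input
; is doubled with paddw and multiplied by 17734 = 35468/2 instead.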

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
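
; The full 2-D inverse transform below is two of these 1-D passes with a
; transpose after each; the rounding bias (pw_4) is added between the two
; passes, and the final >>3 happens inside STORE_DIFFx2 when the result is
; added to the prediction.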

INIT_MMX
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq         m0, [r1+ 0]
    movq         m1, [r1+ 8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%ifidn %1, sse
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    lea          r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------

    
1168
%macro SCATTER_WHT 3
1169
    movd  r1d, m%1
1170
    movd  r2d, m%2
1171
    mov [r0+2*16*(0+%3)], r1w
1172
    mov [r0+2*16*(1+%3)], r2w
1173
    shr   r1d, 16
1174
    shr   r2d, 16
1175
    psrlq m%1, 32
1176
    psrlq m%2, 32
1177
    mov [r0+2*16*(4+%3)], r1w
1178
    mov [r0+2*16*(5+%3)], r2w
1179
    movd  r1d, m%1
1180
    movd  r2d, m%2
1181
    mov [r0+2*16*(8+%3)], r1w
1182
    mov [r0+2*16*(9+%3)], r2w
1183
    shr   r1d, 16
1184
    shr   r2d, 16
1185
    mov [r0+2*16*(12+%3)], r1w
1186
    mov [r0+2*16*(13+%3)], r2w
1187
%endmacro
1188

    
1189
%macro HADAMARD4_1D 4
1190
    SUMSUB_BADC m%2, m%1, m%4, m%3
1191
    SUMSUB_BADC m%4, m%2, m%3, m%1
1192
    SWAP %1, %4, %3
1193
%endmacro
1194

    
1195
INIT_MMX
1196
cglobal vp8_luma_dc_wht_mmx, 2,3
1197
    movq          m0, [r1]
1198
    movq          m1, [r1+8]
1199
    movq          m2, [r1+16]
1200
    movq          m3, [r1+24]
1201
    HADAMARD4_1D  0, 1, 2, 3
1202
    TRANSPOSE4x4W 0, 1, 2, 3, 4
1203
    paddw         m0, [pw_3]
1204
    HADAMARD4_1D  0, 1, 2, 3
1205
    psraw         m0, 3
1206
    psraw         m1, 3
1207
    psraw         m2, 3
1208
    psraw         m3, 3
1209
    SCATTER_WHT   0, 1, 0
1210
    SCATTER_WHT   2, 3, 2
1211
    RET

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
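
; Illustrative scalar outline of the simple filter (clamp() is signed-byte
; saturation): for each edge pixel, if 2*|p0-q0| + |p1-q1|/2 <= flim then
;
;     a  = clamp(clamp(p1 - q1) + 3 * (q0 - p0));
;     f1 = clamp(a + 4) >> 3;
;     f2 = clamp(a + 3) >> 3;
;     q0 -= f1;  p0 += f2;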

; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea           %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd           %5, m%3
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro

%macro SPLATB_REG 3-4
    movd           %1, %2
%ifidn %3, ssse3
    pshufb         %1, %4
%else
    punpcklbw      %1, %1
%if mmsize == 16 ; sse2
    pshuflw        %1, %1, 0x0
    punpcklqdq     %1, %1
%elifidn %3, mmx
    punpcklwd      %1, %1
    punpckldq      %1, %1
%else ; mmxext
    pshufw         %1, %1, 0x0
%endif
%endif
%endmacro
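
; SPLATB_REG broadcasts the low byte of a GPR into every byte lane of %1,
; i.e. it fills the register with copies of (reg & 0xff). On ssse3, %4 must
; be an all-zero register so that pshufb acts as a splat of byte 0.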

%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
    mov            r5, rsp          ; backup stack pointer
    and           rsp, ~(mmsize-1)  ; align stack
%endif
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2
%endif
%ifidn %1, ssse3
    pxor           m0, m0
%endif
    SPLATB_REG     m7, r2, %1, m0   ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1
    neg            r1
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
    sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4

    mova        [rsp], m0           ; store p1
    mova [rsp+mmsize], m3           ; store q1
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    mova           m0, [rsp]        ; p1
    SWAP            2, 4            ; p0
    SWAP            1, 6            ; q0
    mova           m3, [rsp+mmsize] ; q1

    TRANSPOSE4x4B  0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    add            r3, r1           ; change from r4*8*stride to r0+8*stride
    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
%else ; mmx/mmxext
    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8]    ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
%ifidn %2, v
    REP_RET
%else ; h
    mov           rsp, r5           ; restore stack pointer
    RET
%endif
%else ; sse2
%ifidn %2, h
    mov           rsp, r5           ; restore stack pointer
%endif
    RET
%endif
%endmacro

INIT_MMX
SIMPLE_LOOPFILTER mmx,    v, 4
SIMPLE_LOOPFILTER mmx,    h, 6
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2,   v, 3
SIMPLE_LOOPFILTER sse2,   h, 6
SIMPLE_LOOPFILTER ssse3,  v, 3
SIMPLE_LOOPFILTER ssse3,  h, 6

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
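
; flimE/flimI are the VP8 edge and interior limits and hev_thr the high
; edge variance threshold: the edge is filtered only when |p0-q0|*2 +
; |p1-q1|/2 <= E and all interior differences (p3-p2 ... q2-q1, p1-p0,
; q1-q0) are <= I; whether |p1-p0| or |q1-q0| exceeds hev_thr then decides
; if only p0/q0 or also p1/q1 are adjusted.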
1546

    
1547
%macro INNER_LOOPFILTER 5
1548
%if %4 == 8 ; chroma
1549
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
1550
%define dst8_reg    r1
1551
%define mstride_reg r2
1552
%define E_reg       r3
1553
%define I_reg       r4
1554
%define hev_thr_reg r5
1555
%else ; luma
1556
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1557
%define mstride_reg r1
1558
%define E_reg       r2
1559
%define I_reg       r3
1560
%define hev_thr_reg r4
1561
%ifdef m8 ; x86-64, sse2
1562
%define dst8_reg    r4
1563
%elif mmsize == 16 ; x86-32, sse2
1564
%define dst8_reg    r5
1565
%else ; x86-32, mmx/mmxext
1566
%define cnt_reg     r5
1567
%endif
1568
%endif
1569
%define dst_reg     r0
1570
%define stride_reg  E_reg
1571
%define dst2_reg    I_reg
1572
%ifndef m8
1573
%define stack_reg   hev_thr_reg
1574
%endif
1575

    
1576
%ifidn %1, ssse3
1577
    pxor             m7, m7
1578
%endif
1579

    
1580
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
1581
    ; splat function arguments
1582
    SPLATB_REG       m0, E_reg, %1, m7 ; E
1583
    SPLATB_REG       m1, I_reg, %1, m7 ; I
1584
    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh
1585

    
1586
    ; align stack
1587
    mov       stack_reg, rsp         ; backup stack pointer
1588
    and             rsp, ~(mmsize-1) ; align stack
1589
%ifidn %2, v
1590
    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1591
                                     ;               [3]=hev() result
1592
%else ; h
1593
    sub             rsp, mmsize * 5  ; extra storage space for transposes
1594
%endif
1595

    
1596
%define flim_E   [rsp]
1597
%define flim_I   [rsp+mmsize]
1598
%define hev_thr  [rsp+mmsize*2]
1599
%define mask_res [rsp+mmsize*3]
1600
%define p0backup [rsp+mmsize*3]
1601
%define q0backup [rsp+mmsize*4]
1602

    
1603
    mova         flim_E, m0
1604
    mova         flim_I, m1
1605
    mova        hev_thr, m2
1606

    
1607
%else ; sse2 on x86-64
1608

    
1609
%define flim_E   m9
1610
%define flim_I   m10
1611
%define hev_thr  m11
1612
%define mask_res m12
1613
%define p0backup m12
1614
%define q0backup m8
1615

    
1616
    ; splat function arguments
1617
    SPLATB_REG   flim_E, E_reg, %1, m7 ; E
1618
    SPLATB_REG   flim_I, I_reg, %1, m7 ; I
1619
    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
1620
%endif
1621

    
1622
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
1623
    mov         cnt_reg, 2
1624
%endif
1625
    mov      stride_reg, mstride_reg
1626
    neg     mstride_reg
1627
%ifidn %2, h
1628
    lea         dst_reg, [dst_reg + stride_reg*4-4]
1629
%if %4 == 8
1630
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
1631
%endif
1632
%endif
1633

    
1634
%if mmsize == 8
1635
.next8px
1636
%endif
1637
    ; read
1638
    lea        dst2_reg, [dst_reg + stride_reg]
1639
%ifidn %2, v
1640
%if %4 == 8 && mmsize == 16
1641
%define movrow movh
1642
%else
1643
%define movrow mova
1644
%endif
1645
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
1646
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
1647
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
1648
    movrow           m5, [dst2_reg]               ; q1
1649
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
1650
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
1651
%if mmsize == 16 && %4 == 8
1652
    movhps           m0, [dst8_reg+mstride_reg*4]
1653
    movhps           m2, [dst8_reg+mstride_reg*2]
1654
    add        dst8_reg, stride_reg
1655
    movhps           m1, [dst8_reg+mstride_reg*4]
1656
    movhps           m5, [dst8_reg]
1657
    movhps           m6, [dst8_reg+ stride_reg]
1658
    movhps           m7, [dst8_reg+ stride_reg*2]
1659
    add        dst8_reg, mstride_reg
1660
%endif
1661
%elif mmsize == 8 ; mmx/mmxext (h)
1662
    ; read 8 rows of 8px each
1663
    movu             m0, [dst_reg +mstride_reg*4]
1664
    movu             m1, [dst2_reg+mstride_reg*4]
1665
    movu             m2, [dst_reg +mstride_reg*2]
1666
    movu             m3, [dst_reg +mstride_reg]
1667
    movu             m4, [dst_reg]
1668
    movu             m5, [dst2_reg]
1669
    movu             m6, [dst2_reg+ stride_reg]
1670

    
1671
    ; 8x8 transpose
1672
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1673
    mova       q0backup, m1
1674
    movu             m7, [dst2_reg+ stride_reg*2]
1675
    TRANSPOSE4x4B     4, 5, 6, 7, 1
1676
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
1677
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
1678
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1679
    mova             m1, q0backup
1680
    mova       q0backup, m2          ; store q0
1681
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1682
    mova       p0backup, m5          ; store p0
1683
    SWAP              1, 4
1684
    SWAP              2, 4
1685
    SWAP              6, 3
1686
    SWAP              5, 3
1687
%else ; sse2 (h)
1688
%if %4 == 16
1689
    lea        dst8_reg, [dst_reg + stride_reg*8]
1690
%endif
1691

    
1692
    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
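    ; (on mmxext and up we keep a running byte-wise maximum of the absolute
    ;  differences, so a single psubusb/pcmpeqb against I further down
    ;  tests all of them at once)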
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result
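    ; m0 now holds the complete filter mask; as a rough C sketch of the
    ; comparisons accumulated above (all byte-wise, per pixel):
    ;   mask = |p3-p2| <= I && |p2-p1| <= I && |q3-q2| <= I && |q2-q1| <= I
    ;       && |p1-p0| <= I && |q1-q0| <= I
    ;       && 2*|q0-p0| + |q1-p1|/2 <= E;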

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
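    ; (xoring with 0x80 biases the unsigned pixels into signed range, so
    ;  psubsb below yields a correctly saturated signed difference)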
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1
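    ; p0/q0 are now updated; as a hedged C sketch, the common filter above
    ; follows the VP8 spec roughly as:
    ;   f  = clamp(3*(q0-p0) + (hev ? p1-q1 : 0));  // signed saturation
    ;   f1 = clamp(f+4) >> 3;   q0 -= f1;
    ;   f2 = clamp(f+3) >> 3;   p0 += f2;
    ;   if (!hev) { a = (f1+1) >> 1;  q1 -= a;  p1 += a; }  // outer taps below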

%ifdef m12
    SWAP              6, 12
%else
    mova             m6, mask_res
%endif
%ifidn %1, mmx
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%ifidn %1, mmx
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7          ; a
    pavgb            m1, m7          ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1          ; q1-a
    paddusb          m2, m0          ; p1+a

    ; store
%ifidn %2, v
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow [dst_reg + stride_reg  ], m5
%if mmsize == 16 && %4 == 8
    movhps [dst8_reg+mstride_reg*2], m2
    movhps [dst8_reg+mstride_reg  ], m3
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
%endif
%else ; h
    add         dst_reg, 2
    add        dst2_reg, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 2
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro
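
; the macro arguments appear to be: <opt>, <direction v/h>, <#gp regs for
; cglobal>, <block size: 16=luma, 8=chroma>, <#xmm regs>; the %ifdef m8
; checks pick the x86-64 variants, which can keep everything in registers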

INIT_MMX
INNER_LOOPFILTER mmx,    v, 6, 16, 0
INNER_LOOPFILTER mmx,    h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 0

INNER_LOOPFILTER mmx,    v, 6,  8, 0
INNER_LOOPFILTER mmx,    h, 6,  8, 0
INNER_LOOPFILTER mmxext, v, 6,  8, 0
INNER_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
INNER_LOOPFILTER sse2,   v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER sse2,   h, 5, 16, 13
%else
INNER_LOOPFILTER sse2,   h, 6, 16, 13
%endif
INNER_LOOPFILTER sse2,   v, 6,  8, 13
INNER_LOOPFILTER sse2,   h, 6,  8, 13

INNER_LOOPFILTER ssse3,  v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3,  h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3,  h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3,  v, 6,  8, 13
INNER_LOOPFILTER ssse3,  h, 6,  8, 13

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

; write 4 or 8 words from the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write; they can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 are -stride and +stride
; 7 is the optimization name (the pextrw path is taken for sse4)
%macro WRITE_8W 7
%ifidn %7, sse4
    pextrw    [%4+%5*4], %1, 0
    pextrw    [%3+%5*4], %1, 1
    pextrw    [%4+%5*2], %1, 2
    pextrw    [%4+%5  ], %1, 3
    pextrw    [%4     ], %1, 4
    pextrw    [%3     ], %1, 5
    pextrw    [%3+%6  ], %1, 6
    pextrw    [%3+%6*2], %1, 7
%else
    movd             %3, %1
%if mmsize == 8
    punpckhdq        %1, %1
%else
    psrldq           %1, 4
%endif
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    movd             %3, %1
%if mmsize == 16
    psrldq           %1, 4
%endif
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    movd             %3, %2
%if mmsize == 8
    punpckhdq        %2, %2
%else
    psrldq           %2, 4
%endif
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    movd             %3, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
%if mmsize == 8
    add              %4, %5
%endif
%endif
%endmacro
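
; note: the pre-SSE4 path above emulates pextrw-to-memory by pulling 32 bits
; at a time into a GPR with movd and storing one 16-bit pixel pair per row
; (the low word first, then the high word after a shr 16)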

%macro MBEDGE_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
%endif
%define dst_reg     r0
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

%ifidn %1, ssse3
    pxor             m7, m7
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, %1, m7 ; E
    SPLATB_REG       m1, I_reg, %1, m7 ; I
    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh
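    ; (SPLATB_REG broadcasts the low byte of a GPR into every lane of the
    ;  target register; on ssse3 this presumably goes through pshufb with
    ;  the zeroed m7 as shuffle mask, hence the pxor above)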

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
    sub             rsp, mmsize * 8  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
                                     ;               [4]=filter tmp result
                                     ;               [5]/[6] = p2/q2 backup
                                     ;               [7]=lim_res sign result

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define lim_res  [rsp+mmsize*4]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]
%define p2backup [rsp+mmsize*5]
%define q2backup [rsp+mmsize*6]
%define lim_sign [rsp+mmsize*7]
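; note that mask_res aliases p0backup and lim_res aliases q0backup: the row
; backups are only live during the load/transpose stage, before the masks
; and filter values are computed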

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define lim_res  m8
%define p0backup m12
%define q0backup m8
%define p2backup m13
%define q2backup m14
%define lim_sign m15

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, %1, m7 ; E
    SPLATB_REG   flim_I, I_reg, %1, m7 ; I
    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    mova       p2backup, m1
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    mova       q2backup, m6
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1
    pand             m6, m0
%ifdef m8
    mova        lim_res, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand        lim_res, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova        lim_res, m0
%endif
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common
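    ; at this point lim_res holds w for the non-hev pixels (which get the
    ; wide mbedge filter) and m7 holds w for the hev pixels, which get the
    ; common 4-tap filter below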

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
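    ; as a rough C sketch (per the VP8 spec), with w = clamp(clamp(p1-q1)
    ; + 3*(q0-p0)) computed above:
    ;   a0 = clamp((27*w + 63) >> 7);  p0 += a0;  q0 -= a0;
    ;   a1 = clamp((18*w + 63) >> 7);  p1 += a1;  q1 -= a1;
    ;   a2 = clamp(( 9*w + 63) >> 7);  p2 += a2;  q2 -= a2;
    ; pw_27/pw_18/pw_9 below are those weights, pw_63 the rounding bias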
    mova             m7, [pw_63]
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, lim_res
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1         ; which are negative
    punpcklbw        m6, m0         ; signed byte->word
    punpckhbw        m1, m0
    mova       lim_sign, m0
    mova       mask_res, m6         ; backup for later in filter
    mova        lim_res, m1
    pmullw          m6, [pw_27]
    pmullw          m1, [pw_27]
    paddw           m6, m7
    paddw           m1, m7
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a0
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a0
    pandn           m0, m6          ; +a0
    psubusb         m3, m1
    paddusb         m4, m1
    paddusb         m3, m0          ; p0+a0
    psubusb         m4, m0          ; q0-a0

    mova            m6, mask_res
    mova            m1, lim_res
    mova            m0, lim_sign
    pmullw          m6, [pw_18]
    pmullw          m1, [pw_18]
    paddw           m6, m7
    paddw           m1, m7
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a1
    pandn           m0, m6          ; +a1
    psubusb         m2, m1
    paddusb         m5, m1
    paddusb         m2, m0          ; p1+a1
    psubusb         m5, m0          ; q1-a1

%ifdef m8
    SWAP             6, 12
    SWAP             1, 8
%else
    mova            m6, mask_res
    mova            m1, lim_res
%endif
    pmullw          m6, [pw_9]
    pmullw          m1, [pw_9]
    paddw           m6, m7
    paddw           m1, m7
%ifdef m15
    SWAP             7, 15
%else
    mova            m7, lim_sign
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a2
    pxor            m0, m0
    psubb           m0, m6
    pand            m0, m7          ; -a2
    pandn           m7, m6          ; +a2
%ifdef m8
    SWAP             1, 13
    SWAP             6, 14
%else
    mova            m1, p2backup
    mova            m6, q2backup
%endif
    psubusb         m1, m0
    paddusb         m6, m0
    paddusb         m1, m7          ; p2+a2
    psubusb         m6, m7          ; q2-a2

    ; store
%ifidn %2, v
    movrow [dst2_reg+mstride_reg*4], m1
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow   [dst2_reg], m5
    movrow [dst2_reg+ stride_reg  ], m6
%if mmsize == 16 && %4 == 8
    add        dst8_reg, mstride_reg
    movhps [dst8_reg+mstride_reg*2], m1
    movhps [dst8_reg+mstride_reg  ], m2
    movhps   [dst8_reg], m3
    add        dst8_reg, stride_reg
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
    movhps [dst8_reg+ stride_reg*2], m6
%endif
%else ; h
    inc         dst_reg
    inc        dst2_reg

    ; 4x8/16 transpose
    TRANSPOSE4x4B     1, 2, 3, 4, 0
    SBUTTERFLY       bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
    add         dst_reg, 4
    WRITE_8W         m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %1
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
    lea         dst_reg, [dst2_reg+mstride_reg+4]
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
    WRITE_8W         m5, m5, dst2_reg, dst_reg,  mstride_reg, stride_reg, %1
%ifidn %1, sse4
    lea         dst_reg, [dst8_reg+ stride_reg]
%endif
    WRITE_8W         m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %1
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 5
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-5]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro

INIT_MMX
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0

MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 16
%ifdef m8
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 16
%else
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse2,   v, 6,  8, 16
MBEDGE_LOOPFILTER sse2,   h, 6,  8, 16

MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 16
%ifdef m8
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 16
%else
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16
MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16

%ifdef m8
MBEDGE_LOOPFILTER sse4,   h, 5, 16, 16
%else
MBEDGE_LOOPFILTER sse4,   h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse4,   h, 6,  8, 16