ffmpeg / libavcodec / x86 / vp8dsp.asm @ 888fa31e

;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------
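
; Rough scalar sketch of what the 6-tap routines below compute (a hedged
; reference only, not upstream code; assumes av_clip_uint8() from libavutil
; and a filter[6] row selected by mx/my from the tables above):
;
;   for (y = 0; y < height; y++) {
;       for (x = 0; x < size; x++) {
;           int i, sum = 64;                    // taps sum to 128 -> rounding
;           for (i = 0; i < 6; i++)
;               sum += filter[i] * src[x + i - 2];
;           dst[x] = av_clip_uint8(sum >> 7);   // paddsw pw_64 / psraw 7 / packuswb
;       }
;       dst += deststride;
;       src += srcstride;
;   }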

%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
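
; Note on the MMXEXT H filters below: they produce two output pixels per
; pmaddwd pass. pshufw with 0x94 builds overlapping word pairs (A B | B C),
; pmaddwd multiplies and sums adjacent taps, and packssdw merges the two
; dword pairs back into four word results before the usual round/clip/store.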

; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
    shl      r5d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r5, [fourtap_filter_v+r5-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [r5+ 0]
    mova      m6, [r5+16]
%ifdef m8
    mova      m8, [r5+32]
    mova      m9, [r5+48]
%endif
.nextrow
    movq      m0, [r2-1]
    movq      m1, [r2-0]
    movq      m2, [r2+1]
    movq      m3, [r2+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
    lea      r5d, [r5*3]
    shl      r5d, 4
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r5, [sixtap_filter_v+r5-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [r5+ 0]
    mova      m9, [r5+16]
    mova     m10, [r5+32]
    mova     m11, [r5+48]
    mova     m12, [r5+64]
    mova     m13, [r5+80]
%endif
.nextrow
    movq      m0, [r2-2]
    movq      m1, [r2-1]
    movq      m2, [r2-0]
    movq      m3, [r2+1]
    movq      m4, [r2+2]
    movq      m5, [r2+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [r5+ 0]
    pmullw    m1, [r5+16]
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
    pmullw    m4, [r5+64]
    pmullw    m5, [r5+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
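
; Rough scalar sketch of the bilinear filters below (a hedged reference
; only, not upstream code): with frac = mx or my in 1..7 and a/b the two
; neighboring source samples,
;
;   dst[x] = ((8 - frac) * a + frac * b + 4) >> 3;
;
; The SIMD code gets the same rounding from psraw by 2 followed by pavgw
; against a zero register, since ((x >> 2) + 1) >> 1 == (x + 4) >> 3.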

%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7

%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8

cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    lea     r2, [r2+r3*2]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea     r0, [r0+r1*2]
    sub    r4d, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
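
; Rough scalar sketch of the dc_add functions (a hedged reference only,
; assuming av_clip_uint8() from libavutil):
;
;   int dc = (block[0] + 4) >> 3;
;   block[0] = 0;
;   for (y = 0; y < 4; y++)
;       for (x = 0; x < 4; x++)
;           dst[x + y * stride] = av_clip_uint8(dst[x + y * stride] + dc);
;
; ADD_DC below handles the sign of dc with unsigned saturating arithmetic:
; it adds a splat of max(dc, 0) with paddusb and subtracts a splat of
; max(-dc, 0) with psubusb.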

%macro ADD_DC 4
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4    [r0+%3], m2
    %4 [r0+r2+%3], m3
    %4    [r1+%3], m4
    %4 [r1+r2+%3], m5
%endmacro

INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       m0, [r1]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd      [r1], m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    lea        r1, [r0+r2*2]
    ADD_DC     m0, m1, 0, movh
    RET

INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       m0, [r1]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd     [r1], m1
    lea        r1, [r0+r2*2]
    movd       m2, [r0]
    movd       m3, [r0+r2]
    movd       m4, [r1]
    movd       m5, [r1+r2]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd      [r0], m2
    pextrd [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd [r1+r2], m2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET

INIT_XMM
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m1
    movd [r1+32*1], m1
    movd [r1+32*2], m1
    movd [r1+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    lea       r0, [r0+r2*4]
    lea       r1, [r1+r2*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
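
; The 1-D transform below uses the VP8 fixed-point multiplier constants
; 20091 and 35468. 35468 does not fit in a signed word, so the input is
; doubled and multiplied by 17734 instead. A rough scalar sketch of the
; two multiplies (a hedged reference only):
;
;   mul_20091(a) = ((a * 20091) >> 16) + a;   // pmulhw m6, then paddw
;   mul_35468(a) = ((a * 2) * 17734) >> 16;   // paddw a,a, then pmulhw m7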

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

INIT_MMX
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq         m0, [r1+ 0]
    movq         m1, [r1+ 8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%ifidn %1, sse
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    lea          r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
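
; Outline of the data flow below (a hedged note, not upstream text): dc[16]
; gets a 4x4 Walsh-Hadamard transform (rows, transpose, rows), the result
; is rounded with +3 and shifted right by 3, and SCATTER_WHT stores value n
; as the DC coefficient of luma block n, i.e. 2*16*n bytes into the block
; array at r0.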

%macro SCATTER_WHT 3
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(0+%3)], r1w
    mov [r0+2*16*(1+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [r0+2*16*(4+%3)], r1w
    mov [r0+2*16*(5+%3)], r2w
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(8+%3)], r1w
    mov [r0+2*16*(9+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    mov [r0+2*16*(12+%3)], r1w
    mov [r0+2*16*(13+%3)], r2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 1
cglobal vp8_luma_dc_wht_%1, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
%ifidn %1, sse
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

INIT_MMX
VP8_DC_WHT mmx
VP8_DC_WHT sse

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
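
; Rough scalar sketch of the simple filter below (a hedged reference only;
; clip_int8() stands for the signed-byte saturation done by paddsb/psubsb,
; av_clip_uint8() for the unsigned equivalent):
;
;   if (2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= flim) {
;       int a  = clip_int8(p1 - q1) + 3 * (q0 - p0);
;       int f1 = clip_int8(a + 4) >> 3;
;       int f2 = clip_int8(a + 3) >> 3;
;       q0 = av_clip_uint8(q0 - f1);
;       p0 = av_clip_uint8(p0 + f2);
;   }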

; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as in READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea           %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd          %5d, m%3
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 are -stride and +stride
%macro WRITE_2x4W 6
    movd            %3d, %1
    punpckhdq        %1, %1
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    movd            %3d, %1
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    movd            %3d, %2
    punpckhdq        %2, %2
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    movd            %3d, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
    add              %4, %5
%endmacro

%macro WRITE_8W_SSE2 5
    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3+%4*4], %2w
    shr              %2, 16
    add              %3, %5
    mov       [%3+%4*4], %2w

    movd            %2d, %1
    psrldq           %1, 4
    add              %3, %4
    mov       [%3+%4*2], %2w
    shr              %2, 16
    mov       [%3+%4  ], %2w

    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3     ], %2w
    shr              %2, 16
    mov       [%3+%5  ], %2w

    movd            %2d, %1
    add              %3, %5
    mov       [%3+%5  ], %2w
    shr              %2, 16
    mov       [%3+%5*2], %2w
%endmacro

%macro WRITE_8W_SSE4 5
    pextrw    [%3+%4*4], %1, 0
    pextrw    [%2+%4*4], %1, 1
    pextrw    [%3+%4*2], %1, 2
    pextrw    [%3+%4  ], %1, 3
    pextrw    [%3     ], %1, 4
    pextrw    [%2     ], %1, 5
    pextrw    [%2+%5  ], %1, 6
    pextrw    [%2+%5*2], %1, 7
%endmacro

%macro SPLATB_REG_MMX 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    punpcklwd      %1, %1
    punpckldq      %1, %1
%endmacro

%macro SPLATB_REG_MMXEXT 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshufw         %1, %1, 0x0
%endmacro

%macro SPLATB_REG_SSE2 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshuflw        %1, %1, 0x0
    punpcklqdq     %1, %1
%endmacro

%macro SPLATB_REG_SSSE3 3
    movd           %1, %2d
    pshufb         %1, %3
%endmacro

%macro SIMPLE_LOOPFILTER 4
cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2
%endif
%ifnidn %1, sse2
%if mmsize == 16
    pxor           m0, m0
%endif
%endif
    SPLATB_REG     m7, r2, m0       ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1
    neg            r1
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    inc           r0
    SBUTTERFLY    bw, 6, 4, 0

%if mmsize == 16 ; sse2
%ifidn %1, sse4
    inc            r4
%endif
    WRITE_8W       m6, r4, r0, r1, r2
    lea            r4, [r3+r1+1]
%ifidn %1, sse4
    inc            r3
%endif
    WRITE_8W       m4, r3, r4, r1, r2
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, r4, r0, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
SIMPLE_LOOPFILTER mmx,    v, 4, 0
SIMPLE_LOOPFILTER mmx,    h, 5, 0
%define SPLATB_REG SPLATB_REG_MMXEXT
SIMPLE_LOOPFILTER mmxext, v, 4, 0
SIMPLE_LOOPFILTER mmxext, h, 5, 0
INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
SIMPLE_LOOPFILTER sse2,   v, 3, 8
SIMPLE_LOOPFILTER sse2,   h, 5, 8
%define SPLATB_REG SPLATB_REG_SSSE3
SIMPLE_LOOPFILTER ssse3,  v, 3, 8
SIMPLE_LOOPFILTER ssse3,  h, 5, 8
%define WRITE_8W   WRITE_8W_SSE4
SIMPLE_LOOPFILTER sse4,   h, 5, 8

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
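
; Outline of the inner filter's control flow (a hedged note, not upstream
; text): the filter mask requires abs(p3-p2), abs(p2-p1), abs(p1-p0),
; abs(q1-q0), abs(q2-q1) and abs(q3-q2) to all be <= I, plus
; 2*abs(p0-q0) + abs(p1-q1)/2 <= E; a separate high-edge-variance (hev)
; mask is set where abs(p1-p0) > hev_thr or abs(q1-q0) > hev_thr and
; selects how the common filter adjusts p1/q1.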
1635

    
1636
%macro INNER_LOOPFILTER 5
1637
%if %4 == 8 ; chroma
1638
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
1639
%define dst8_reg    r1
1640
%define mstride_reg r2
1641
%define E_reg       r3
1642
%define I_reg       r4
1643
%define hev_thr_reg r5
1644
%else ; luma
1645
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1646
%define mstride_reg r1
1647
%define E_reg       r2
1648
%define I_reg       r3
1649
%define hev_thr_reg r4
1650
%ifdef m8 ; x86-64, sse2
1651
%define dst8_reg    r4
1652
%elif mmsize == 16 ; x86-32, sse2
1653
%define dst8_reg    r5
1654
%else ; x86-32, mmx/mmxext
1655
%define cnt_reg     r5
1656
%endif
1657
%endif
1658
%define dst_reg     r0
1659
%define stride_reg  E_reg
1660
%define dst2_reg    I_reg
1661
%ifndef m8
1662
%define stack_reg   hev_thr_reg
1663
%endif
1664

    
1665
%ifnidn %1, sse2
1666
%if mmsize == 16
1667
    pxor             m7, m7
1668
%endif
1669
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, m7   ; E
    SPLATB_REG       m1, I_reg, m7   ; I
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
%ifidn %2, v
    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
%else ; h
    sub             rsp, mmsize * 5  ; extra storage space for transposes
%endif

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define p0backup m12
%define q0backup m8

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, m7   ; E
    SPLATB_REG   flim_I, I_reg, m7   ; I
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif
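
    ; p3..p0 are the four pixels on one side of the filtered edge and
    ; q0..q3 the four on the other; the h variants gather them with the
    ; byte transposes above, and p0/q0 sit in backup slots (or spare xmm
    ; registers) until they are needed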

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif
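
    ; all comparisons use unsigned saturation: psubusb clamps at zero, so
    ; x <= I exactly when (x -us I) == 0, which pcmpeqb against zero turns
    ; into a 0xff/0x00 byte mask; mmxext and later instead fold the terms
    ; with pmaxub and do a single subtract/compare further down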

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
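
    ; mask_res now holds the inverse of the spec's hev (high edge
    ; variance) flag: hev = abs(p1-p0) > hev_thresh || abs(q1-q0) >
    ; hev_thresh; where hev is set, the (p1-q1) term is included below
    ; and p1/q1 are left untouched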

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result
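
    ; m0 now holds the complete per-pixel filter mask:
    ;   abs(p3-p2) <= I && abs(p2-p1) <= I && abs(p1-p0) <= I &&
    ;   abs(q1-q0) <= I && abs(q2-q1) <= I && abs(q3-q2) <= I &&
    ;   abs(q0-p0)*2 + abs(q1-p1)/2 <= E
    ; (the pb_FE mask clears the low bit of abs(q1-p1) so the per-register
    ;  psrlq halving cannot leak bits between neighbouring pixels)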

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1
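
    ; i.e. with w = clamp(3*(q0-p0) + (hev ? clamp(p1-q1) : 0)), the
    ; spec's f2 = clamp(w+3)>>3 is added to p0 and f1 = clamp(w+4)>>3 is
    ; subtracted from q0; pb_F8 clears each byte's low 3 bits so the
    ; per-register psrlq cannot leak bits into the neighbouring pixel,
    ; and the pcmpgtb sign mask recombines the separately shifted
    ; positive and negated copies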

%ifdef m12
    SWAP              6, 12
%else
    mova             m6, mask_res
%endif
%ifidn %1, mmx
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%ifidn %1, mmx
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7          ; a
    pavgb            m1, m7          ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1          ; q1-a
    paddusb          m2, m0          ; p1+a
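
    ; outer-tap adjustment from the spec: a = (f1+1)>>1, added to p1 and
    ; subtracted from q1, only where hev is clear (the pand with mask_res);
    ; pavgb against zero gives the rounded halving in one op on mmxext+,
    ; while plain mmx emulates it with paddusb/pand/psrlq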

    ; store
%ifidn %2, v
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow [dst_reg + stride_reg  ], m5
%if mmsize == 16 && %4 == 8
    movhps [dst8_reg+mstride_reg*2], m2
    movhps [dst8_reg+mstride_reg  ], m3
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
%endif
%else ; h
    add         dst_reg, 2
    add        dst2_reg, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 2
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro
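
; macro arguments: %1 = instruction set suffix, %2 = filter direction
; (h/v), %3 = number of GPRs passed to cglobal, %4 = block width
; (16 = luma, 8 = chroma), %5 = number of XMM registers (0 for MMX builds)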

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
INNER_LOOPFILTER mmx,    v, 6, 16, 0
INNER_LOOPFILTER mmx,    h, 6, 16, 0
INNER_LOOPFILTER mmx,    v, 6,  8, 0
INNER_LOOPFILTER mmx,    h, 6,  8, 0

%define SPLATB_REG SPLATB_REG_MMXEXT
INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6,  8, 0
INNER_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
INNER_LOOPFILTER sse2,   v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER sse2,   h, 5, 16, 13
%else
INNER_LOOPFILTER sse2,   h, 6, 16, 13
%endif
INNER_LOOPFILTER sse2,   v, 6,  8, 13
INNER_LOOPFILTER sse2,   h, 6,  8, 13

%define SPLATB_REG SPLATB_REG_SSSE3
INNER_LOOPFILTER ssse3,  v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3,  h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3,  h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3,  v, 6,  8, 13
INNER_LOOPFILTER ssse3,  h, 6,  8, 13

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

%macro MBEDGE_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
%endif
%define dst_reg     r0
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

%define ssse3_or_higher 0
%ifnidn %1, sse2
%if mmsize == 16
%define ssse3_or_higher 1
%endif
%endif

%if ssse3_or_higher
    pxor             m7, m7
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, m7   ; E
    SPLATB_REG       m1, I_reg, m7   ; I
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
%if mmsize == 16
    sub             rsp, mmsize * 7
%else
    sub             rsp, mmsize * 8  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
                                     ;               [4]=filter tmp result
                                     ;               [5]/[6] = p2/q2 backup
                                     ;               [7]=lim_res sign result
%endif

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define lim_res  [rsp+mmsize*4]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]
%define p2backup [rsp+mmsize*5]
%define q2backup [rsp+mmsize*6]
%if mmsize == 16
%define lim_sign [rsp]
%else
%define lim_sign [rsp+mmsize*7]
%endif

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define lim_res  m8
%define p0backup m12
%define q0backup m8
%define p2backup m13
%define q2backup m14
%define lim_sign m9

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, m7   ; E
    SPLATB_REG   flim_I, I_reg, m7   ; I
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    mova       p2backup, m1
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    mova       q2backup, m6
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1
    pand             m6, m0
%ifdef m8
    mova        lim_res, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand        lim_res, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova        lim_res, m0
%endif
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common
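
    ; the hev mask splits the work: lim_res keeps w for the pixels
    ; without high edge variance (6-tap filter_mbedge below), while m7
    ; keeps w for the hev pixels, which only get the common f1/f2
    ; adjustment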

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
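    ; the three stages below implement the spec's wide filter:
    ;   a0 = clamp((27*w + 63) >> 7), p0 += a0, q0 -= a0
    ;   a1 = clamp((18*w + 63) >> 7), p1 += a1, q1 -= a1
    ;   a2 = clamp(( 9*w + 63) >> 7), p2 += a2, q2 -= a2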
%if ssse3_or_higher
    mova             m7, [pb_1]
%else
    mova             m7, [pw_63]
%endif
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, lim_res
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1         ; which are negative
%if ssse3_or_higher
    punpcklbw        m6, m7         ; interleave with "1" for rounding
    punpckhbw        m1, m7
%else
    punpcklbw        m6, m0         ; signed byte->word
    punpckhbw        m1, m0
%endif
    mova       lim_sign, m0
%if ssse3_or_higher
    mova             m7, [pb_27_63]
%ifndef m8
    mova        lim_res, m1
%endif
%ifdef m10
    SWAP              0, 10         ; don't lose lim_sign copy
%endif
    mova             m0, m7
    pmaddubsw        m7, m6
    SWAP              6, 7
    pmaddubsw        m0, m1
    SWAP              1, 0
%ifdef m10
    SWAP              0, 10
%else
    mova             m0, lim_sign
%endif
%else
    mova       mask_res, m6         ; backup for later in filter
    mova        lim_res, m1
    pmullw          m6, [pw_27]
    pmullw          m1, [pw_27]
    paddw           m6, m7
    paddw           m1, m7
%endif
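    ; (on ssse3 each w byte was interleaved with a constant 1, so one
    ;  pmaddubsw against the [27,63] byte pairs yields 27*w + 63 per word;
    ;  the older paths widen to words and use pmullw plus paddw [pw_63])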
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a0
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a0
    pandn           m0, m6          ; +a0
%if ssse3_or_higher
    mova            m6, [pb_18_63]  ; pipelining
%endif
    psubusb         m3, m1
    paddusb         m4, m1
    paddusb         m3, m0          ; p0+a0
    psubusb         m4, m0          ; q0-a0

%if ssse3_or_higher
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, lim_res
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%ifdef m10
    SWAP             0, 10
%endif
    mova            m0, lim_sign
%else
    mova            m6, mask_res
    mova            m1, lim_res
    pmullw          m6, [pw_18]
    pmullw          m1, [pw_18]
    paddw           m6, m7
    paddw           m1, m7
%endif
    mova            m0, lim_sign
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a1
    pandn           m0, m6          ; +a1
%if ssse3_or_higher
    mova            m6, [pb_9_63]
%endif
    psubusb         m2, m1
    paddusb         m5, m1
    paddusb         m2, m0          ; p1+a1
    psubusb         m5, m0          ; q1-a1

%if ssse3_or_higher
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, lim_res
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%else
%ifdef m8
    SWAP             6, 12
    SWAP             1, 8
%else
    mova            m6, mask_res
    mova            m1, lim_res
%endif
    pmullw          m6, [pw_9]
    pmullw          m1, [pw_9]
    paddw           m6, m7
    paddw           m1, m7
%endif
%ifdef m9
    SWAP             7, 9
%else
    mova            m7, lim_sign
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a2
    pxor            m0, m0
    psubb           m0, m6
    pand            m0, m7          ; -a2
    pandn           m7, m6          ; +a2
%ifdef m8
    SWAP             1, 13
    SWAP             6, 14
%else
    mova            m1, p2backup
    mova            m6, q2backup
%endif
    psubusb         m1, m0
    paddusb         m6, m0
    paddusb         m1, m7          ; p2+a2
    psubusb         m6, m7          ; q2-a2

    ; store
%ifidn %2, v
    movrow [dst2_reg+mstride_reg*4], m1
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow   [dst2_reg], m5
    movrow [dst2_reg+ stride_reg  ], m6
%if mmsize == 16 && %4 == 8
    add        dst8_reg, mstride_reg
    movhps [dst8_reg+mstride_reg*2], m1
    movhps [dst8_reg+mstride_reg  ], m2
    movhps   [dst8_reg], m3
    add        dst8_reg, stride_reg
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
    movhps [dst8_reg+ stride_reg*2], m6
%endif
%else ; h
    inc         dst_reg
    inc        dst2_reg

    ; 4x8/16 transpose
    TRANSPOSE4x4B     1, 2, 3, 4, 0
    SBUTTERFLY       bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
    add         dst_reg, 4
    WRITE_2x4W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
    lea         dst_reg, [dst2_reg+mstride_reg+4]
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
%ifidn %1, sse4
    add        dst2_reg, 4
%endif
    WRITE_8W         m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
%ifidn %1, sse4
    lea        dst2_reg, [dst8_reg+ stride_reg]
%endif
    WRITE_8W         m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
%endif
%endif
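
    ; unlike the inner filter, mbedge writes back six pixels per edge
    ; (p2..q2), hence the WRITE_2x4W/WRITE_8W calls for the two extra
    ; columns in the h path above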

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 5
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-5]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0

%define SPLATB_REG SPLATB_REG_MMXEXT
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 15
%ifdef m8
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 15
%else
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER sse2,   v, 6,  8, 15
MBEDGE_LOOPFILTER sse2,   h, 6,  8, 15

%define SPLATB_REG SPLATB_REG_SSSE3
MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 15
%ifdef m8
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 15
%else
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 15
MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 15

%define WRITE_8W   WRITE_8W_SSE4
%ifdef m8
MBEDGE_LOOPFILTER sse4,   h, 5, 16, 15
%else
MBEDGE_LOOPFILTER sse4,   h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER sse4,   h, 6,  8, 15
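
; the sse4 variants only replace WRITE_8W (the horizontal writeback),
; so only the h functions are re-instantiated here; the v versions are
; presumably taken from the ssse3 build at init time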