Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp.asm @ f2a30bd8

History | View | Annotate | Download (37.9 KB)

1
;******************************************************************************
2
;* VP8 MMXEXT optimizations
3
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5
;*
6
;* This file is part of FFmpeg.
7
;*
8
;* FFmpeg is free software; you can redistribute it and/or
9
;* modify it under the terms of the GNU Lesser General Public
10
;* License as published by the Free Software Foundation; either
11
;* version 2.1 of the License, or (at your option) any later version.
12
;*
13
;* FFmpeg is distributed in the hope that it will be useful,
14
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
;* Lesser General Public License for more details.
17
;*
18
;* You should have received a copy of the GNU Lesser General Public
19
;* License along with FFmpeg; if not, write to the Free Software
20
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
;******************************************************************************
22

    
23
%include "x86inc.asm"
24
%include "x86util.asm"
25

    
26
SECTION_RODATA
27

    
28
fourtap_filter_hw_m: times 4 dw  -6, 123
29
                     times 4 dw  12,  -1
30
                     times 4 dw  -9,  93
31
                     times 4 dw  50,  -6
32
                     times 4 dw  -6,  50
33
                     times 4 dw  93,  -9
34
                     times 4 dw  -1,  12
35
                     times 4 dw 123,  -6
36

    
37
sixtap_filter_hw_m:  times 4 dw   2, -11
38
                     times 4 dw 108,  36
39
                     times 4 dw  -8,   1
40
                     times 4 dw   3, -16
41
                     times 4 dw  77,  77
42
                     times 4 dw -16,   3
43
                     times 4 dw   1,  -8
44
                     times 4 dw  36, 108
45
                     times 4 dw -11,   2
46

    
47
fourtap_filter_hb_m: times 8 db  -6, 123
48
                     times 8 db  12,  -1
49
                     times 8 db  -9,  93
50
                     times 8 db  50,  -6
51
                     times 8 db  -6,  50
52
                     times 8 db  93,  -9
53
                     times 8 db  -1,  12
54
                     times 8 db 123,  -6
55

    
56
sixtap_filter_hb_m:  times 8 db   2,   1
57
                     times 8 db -11, 108
58
                     times 8 db  36,  -8
59
                     times 8 db   3,   3
60
                     times 8 db -16,  77
61
                     times 8 db  77, -16
62
                     times 8 db   1,   2
63
                     times 8 db  -8,  36
64
                     times 8 db 108, -11
65

    
66
fourtap_filter_v_m:  times 8 dw  -6
67
                     times 8 dw 123
68
                     times 8 dw  12
69
                     times 8 dw  -1
70
                     times 8 dw  -9
71
                     times 8 dw  93
72
                     times 8 dw  50
73
                     times 8 dw  -6
74
                     times 8 dw  -6
75
                     times 8 dw  50
76
                     times 8 dw  93
77
                     times 8 dw  -9
78
                     times 8 dw  -1
79
                     times 8 dw  12
80
                     times 8 dw 123
81
                     times 8 dw  -6
82

    
83
sixtap_filter_v_m:   times 8 dw   2
84
                     times 8 dw -11
85
                     times 8 dw 108
86
                     times 8 dw  36
87
                     times 8 dw  -8
88
                     times 8 dw   1
89
                     times 8 dw   3
90
                     times 8 dw -16
91
                     times 8 dw  77
92
                     times 8 dw  77
93
                     times 8 dw -16
94
                     times 8 dw   3
95
                     times 8 dw   1
96
                     times 8 dw  -8
97
                     times 8 dw  36
98
                     times 8 dw 108
99
                     times 8 dw -11
100
                     times 8 dw   2
101

    
102
bilinear_filter_vw_m: times 8 dw 1
103
                      times 8 dw 2
104
                      times 8 dw 3
105
                      times 8 dw 4
106
                      times 8 dw 5
107
                      times 8 dw 6
108
                      times 8 dw 7
109

    
110
bilinear_filter_vb_m: times 8 db 7, 1
111
                      times 8 db 6, 2
112
                      times 8 db 5, 3
113
                      times 8 db 4, 4
114
                      times 8 db 3, 5
115
                      times 8 db 2, 6
116
                      times 8 db 1, 7
117

    
118
%ifdef PIC
119
%define fourtap_filter_hw    r11
120
%define sixtap_filter_hw     r11
121
%define fourtap_filter_hb    r11
122
%define sixtap_filter_hb     r11
123
%define fourtap_filter_v     r11
124
%define sixtap_filter_v      r11
125
%define bilinear_filter_vw   r11
126
%define bilinear_filter_vb   r11
127
%else
128
%define fourtap_filter_hw fourtap_filter_hw_m
129
%define sixtap_filter_hw  sixtap_filter_hw_m
130
%define fourtap_filter_hb fourtap_filter_hb_m
131
%define sixtap_filter_hb  sixtap_filter_hb_m
132
%define fourtap_filter_v  fourtap_filter_v_m
133
%define sixtap_filter_v   sixtap_filter_v_m
134
%define bilinear_filter_vw bilinear_filter_vw_m
135
%define bilinear_filter_vb bilinear_filter_vb_m
136
%endif
137

    
138
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
139
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
140

    
141
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
142
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
143
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
144

    
145
pw_20091: times 4 dw 20091
146
pw_17734: times 4 dw 17734
147

    
148
cextern pw_3
149
cextern pb_3
150
cextern pw_4
151
cextern pb_4
152
cextern pw_64
153
cextern pb_80
154
cextern pb_F8
155
cextern pb_FE
156

    
157
SECTION .text
158

    
159
;-----------------------------------------------------------------------------
160
; subpel MC functions:
161
;
162
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
163
;                                              uint8_t *src, int srcstride,
164
;                                              int height,   int mx, int my);
165
;-----------------------------------------------------------------------------
166

    
167
; FILTER_SSSE3 width, xmm_a, xmm_b
;   %1 = block width in pixels (used with INIT_MMX for 4, INIT_XMM for 8)
;   %2 = XMM register count handed to cglobal for the h6/v4/v6 variants
;   %3 = XMM register count handed to cglobal for the h4 variant
; Emits put_vp8_epel%1_{h6,h4,v4,v6}_ssse3.  All variants pair up source
; bytes, multiply-accumulate them against signed byte taps with pmaddubsw,
; add 64, shift right by 7 and pack with unsigned saturation.
; Register roles follow the prototype above: r0 = dst, r1 = dst stride,
; r2 = src, r3 = src stride, r4d = rows left, r5 = mx, r6 = my.
; The row counter is tested as r4d because height is a 32-bit int.
; NOTE(review): the strides are declared int but used as full-width
; registers; confirm that callers provide them sign-extended.
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]                   ; r5 = mx*3, scaled by 8 below
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]        ; byte pairs (0,5),(1,6),...
%endif
    pshufb    m1, m3                       ; byte pairs (1,2),(2,3),...
    pshufb    m2, m4                       ; byte pairs (3,4),(4,5),...
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                  ; rounding
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4                        ; r5 = mx*16 (one table row is 16 bytes)
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3                       ; byte pairs (0,1),(1,2),...
    pshufb    m1, m4                       ; byte pairs (2,3),(3,4),...
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2                       ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4                        ; r6 = my*16
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; slide the row window down by one
    punpcklbw m4, m1                       ; interleave rows 0 and 1
    mova      m1, m2
    punpcklbw m2, m3                       ; interleave rows 2 and 3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7                       ; rounding
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]                   ; r6 = my*3, scaled by 8 below
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]  ; taps are read from memory in the loop

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; interleave rows 0 and 5
    mova      m0, m1
    punpcklbw m1, m2                       ; interleave rows 1 and 2
    mova      m7, m3
    punpcklbw m7, m4                       ; interleave rows 3 and 4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                       ; slide the row window down by one
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row
    jg .nextrow
    REP_RET
%endmacro
326

    
327
INIT_MMX
328
FILTER_SSSE3 4, 0, 0
329
INIT_XMM
330
FILTER_SSSE3 8, 8, 7
331

    
332
; 4-pixel-wide block, H-only 4-tap filter
; Pixels are widened to words and multiplied two taps at a time with pmaddwd.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r5 = mx (per the prototype above).  The row counter is tested as r4d
; because height is a 32-bit int.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; r5 = mx*16 (one table row is 16 bytes)
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row
    jg .nextrow
    REP_RET
378

    
379
; 4-pixel-wide block, H-only 6-tap filter
; Pixels are widened to words and multiplied two taps at a time with pmaddwd.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r5 = mx (per the prototype above).  The row counter is tested as r4d
; because height is a 32-bit int.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; r5 = mx*3, scaled by 8 below
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE (mm3 is re-zeroed below)
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; restore the zero register
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row
    jg .nextrow
    REP_RET
435

    
436
; 8-pixel-wide block, H-only 4-tap filter
; Each row is computed as two halves of 4 output pixels; both halves widen
; the source bytes to words and use pmaddwd against the word tap pairs.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r5 = mx (per the prototype above).  The row counter is tested as r4d
; because height is a 32-bit int.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                        ; r5 = mx*16 (one table row is 16 bytes)
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7

.nextrow:
    ; output pixels 0-3
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; output pixels 4-7 (source shifted by 4 bytes)
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; FGHIJKL
    psrldq    m3, 4         ; GHIJKL
    psrldq    m4, 6         ; HIJKL
    punpcklwd m1, m2        ; EFFGGHHI
    punpcklwd m3, m4        ; GHHIIJJK
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    packssdw  m0, m1        ; merge dword->word (8px)
    paddsw    m0, [pw_64]   ; rounding
    psraw     m0, 7
    packuswb  m0, m7        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row
    jg .nextrow
    REP_RET
488

    
489
; 8-pixel-wide block, H-only 6-tap filter
; Each row is computed as two halves of 4 output pixels from one 16-byte
; load (bytes A..P starting at src-2).  All eight XMM registers are in use,
; so the word tap pairs are read from memory through r5.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r5 = mx on entry (per the prototype above).  The row counter is tested
; as r4d because height is a 32-bit int.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]    ; r5 = mx*3, scaled by 8 below
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7

.nextrow:
    ; output pixels 0-3
    movu      m0, [r2-2]
    mova      m6, m0        ; keep raw bytes for the second half
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4         ; skip bytes ABCD
    punpcklbw m4, m7        ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2         ; FGHIJKL
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m0, m2
    paddd     m0, m4

    ; output pixels 4-7 (same steps on the bytes starting at E)
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; FGHIJKL
    psrldq    m2, 4         ; GHIJKL
    psrldq    m3, 6         ; HIJKL
    psrldq    m4, 4         ; skip bytes EFGH
    punpcklbw m4, m7        ; IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2         ; JKLMNOP
    punpcklwd m6, m1        ; EFFGGHHI
    punpcklwd m2, m3        ; GHHIIJJK
    punpcklwd m4, m5        ; IJJKKLLM
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    packssdw  m0, m6        ; merge dword->word (8px)
    paddsw    m0, [pw_64]   ; rounding
    psraw     m0, 7
    packuswb  m0, m7        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row
    jg .nextrow
    REP_RET
555

    
556
; FILTER_V opt, width, xmm_regs
;   %1 = instruction-set suffix used in the function names
;   %2 = block width in pixels
;   %3 = XMM register count handed to cglobal
; Emits put_vp8_epel%2_v4_%1 and put_vp8_epel%2_v6_%1.  Rows are widened to
; words, multiplied per tap with pmullw and summed with signed saturation.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r6 = my (per the prototype above).  The row counter is tested as r4d
; because height is a 32-bit int.
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; r6 = my*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 -> four 16-byte tap rows
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]                  ; last tap stays in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep unscaled copy for the next row
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; r6 = my*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 -> six 16-byte tap rows
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; slide the row window down by one
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
%endmacro
674

    
675
INIT_MMX
676
FILTER_V mmxext, 4, 0
677
INIT_XMM
678
FILTER_V sse2,   8, 8
679

    
680
; FILTER_BILINEAR opt, width, xmm_regs
;   %1 = instruction-set suffix used in the function names (mmxext or sse2)
;   %2 = block width in pixels
;   %3 = XMM register count handed to cglobal
; Emits put_vp8_bilinear%2_v_%1 and put_vp8_bilinear%2_h_%1.  Two output
; rows are produced per loop iteration.  Each output is
; (a*(8-frac) + b*frac) >> 2 followed by pavgw against zero, i.e. a further
; rounded shift right by 1.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r5 = mx, r6 = my (assuming the same argument order as the epel prototype
; above -- TODO confirm).  The row counter is tested as r4d because it is
; a 32-bit int.
; NOTE(review): a fraction of 0 would index outside bilinear_filter_vw_m;
; presumably callers never pass 0 -- confirm.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d                      ; r5 = (8-my)*16, r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16] ; weight for the upper row
    mova      m5, [bilinear_filter_vw+r6-16] ; weight for the lower row
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row is used by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                       ; (x+1)>>1
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d                      ; r6 = (8-mx)*16, r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16] ; weight for the left pixel
    mova      m5, [bilinear_filter_vw+r5-16] ; weight for the right pixel
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                       ; (x+1)>>1
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET
%endmacro
772

    
773
INIT_MMX
774
FILTER_BILINEAR mmxext, 4, 0
775
INIT_XMM
776
FILTER_BILINEAR   sse2, 8, 7
777

    
778
; FILTER_BILINEAR_SSSE3 width
;   %1 = block width in pixels (used with INIT_MMX for 4, INIT_XMM for 8)
; Emits put_vp8_bilinear%1_v_ssse3 and put_vp8_bilinear%1_h_ssse3.  Source
; bytes are paired up and multiplied against the byte weight pairs in
; bilinear_filter_vb_m with pmaddubsw; the sum is shifted right by 2 and
; then averaged against zero with pavgw (a further rounded shift by 1).
; Two output rows are produced per loop iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = rows left,
; r5 = mx, r6 = my (assuming the same argument order as the epel prototype
; above -- TODO confirm).  The row counter is tested as r4d because it is
; a 32-bit int.
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                        ; r6 = my*16 (one table row is 16 bytes)
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                       ; interleave rows 0 and 1
    punpcklbw m1, m2                       ; interleave rows 1 and 2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x+1)>>1
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                        ; r5 = mx*16 (one table row is 16 bytes)
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2                       ; byte pairs (0,1),(1,2),...
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x+1)>>1
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET
%endmacro
851

    
852
INIT_MMX
853
FILTER_BILINEAR_SSSE3 4
854
INIT_XMM
855
FILTER_BILINEAR_SSSE3 8
856

    
857
; Straight copy of 8-byte-wide rows, two rows per iteration:
; reads from [r2] with row stride r3, writes to [r0] with row stride r1,
; r4d = rows left.  (Presumably r0/r1/r2/r3/r4 are dst, dst stride, src,
; src stride and height as in the epel prototype -- confirm against caller.)
cglobal put_vp8_pixels8_mmx, 5,5
858
.nextrow:
859
    movq  mm0, [r2+r3*0]
860
    movq  mm1, [r2+r3*1]
861
    lea    r2, [r2+r3*2]        ; advance source by two rows
862
    movq [r0+r1*0], mm0
863
    movq [r0+r1*1], mm1
864
    lea    r0, [r0+r1*2]        ; advance destination by two rows
865
    sub   r4d, 2
866
    jg .nextrow
867
    REP_RET
868

    
869
; Straight copy of 16-byte-wide rows using two 8-byte MMX moves per row,
; two rows per iteration: reads from [r2] with row stride r3, writes to
; [r0] with row stride r1, r4d = rows left.
cglobal put_vp8_pixels16_mmx, 5,5
870
.nextrow:
871
    movq  mm0, [r2+r3*0+0]
872
    movq  mm1, [r2+r3*0+8]
873
    movq  mm2, [r2+r3*1+0]
874
    movq  mm3, [r2+r3*1+8]
875
    lea    r2, [r2+r3*2]        ; advance source by two rows
876
    movq [r0+r1*0+0], mm0
877
    movq [r0+r1*0+8], mm1
878
    movq [r0+r1*1+0], mm2
879
    movq [r0+r1*1+8], mm3
880
    lea    r0, [r0+r1*2]        ; advance destination by two rows
881
    sub   r4d, 2
882
    jg .nextrow
883
    REP_RET
884

    
885
; Straight copy of 16-byte-wide rows, two rows per iteration: reads from
; [r2] with row stride r3, writes to [r0] with row stride r1, r4d = rows
; left.  Loads are unaligned (movups); stores use movaps, so the
; destination rows must be 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
886
.nextrow:
887
    movups xmm0, [r2+r3*0]
888
    movups xmm1, [r2+r3*1]
889
    lea     r2, [r2+r3*2]       ; advance source by two rows
890
    movaps [r0+r1*0], xmm0
891
    movaps [r0+r1*1], xmm1
892
    lea     r0, [r0+r1*2]       ; advance destination by two rows
893
    sub    r4d, 2
894
    jg .nextrow
895
    REP_RET
896

    
897
;-----------------------------------------------------------------------------
898
; IDCT functions:
899
;
900
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
901
;-----------------------------------------------------------------------------
902

    
903
; DC-only inverse transform: adds (block[0]+4)>>3 to a 4x4 block of pixels.
; r0 = dst, r1 = block, r2 = stride (per the prototype above).
; The signed DC is split into a non-negative part (mm0) and the negated
; part (mm1); after packuswb one of the two is zero, so an unsigned
; saturating add followed by an unsigned saturating subtract applies it.
cglobal vp8_idct_dc_add_mmx, 3, 3
904
    ; load data
905
    movd       mm0, [r1]
906

    
907
    ; calculate DC
908
    paddw      mm0, [pw_4]
909
    pxor       mm1, mm1
910
    psraw      mm0, 3
911
    psubw      mm1, mm0             ; mm1 = -dc
912
    packuswb   mm0, mm0             ; clamp to 0..255: max(dc, 0)
913
    packuswb   mm1, mm1             ; clamp to 0..255: max(-dc, 0)
914
    punpcklbw  mm0, mm0             ; replicate the low byte ...
915
    punpcklbw  mm1, mm1
916
    punpcklwd  mm0, mm0             ; ... into the low 4 bytes
917
    punpcklwd  mm1, mm1
918

    
919
    ; add DC
920
    lea         r1, [r0+r2*2]       ; r1 = address of row 2 (block pointer no longer needed)
921
    movd       mm2, [r0]
922
    movd       mm3, [r0+r2]
923
    movd       mm4, [r1]
924
    movd       mm5, [r1+r2]
925
    paddusb    mm2, mm0
926
    paddusb    mm3, mm0
927
    paddusb    mm4, mm0
928
    paddusb    mm5, mm0
929
    psubusb    mm2, mm1
930
    psubusb    mm3, mm1
931
    psubusb    mm4, mm1
932
    psubusb    mm5, mm1
933
    movd      [r0], mm2
934
    movd   [r0+r2], mm3
935
    movd      [r1], mm4
936
    movd   [r1+r2], mm5
937
    RET
938

    
939
; DC-only inverse transform: adds (block[0]+4)>>3 to a 4x4 block of pixels.
; r0 = dst, r1 = block, r2 = stride (per the prototype above).
; All four rows are widened to words in two registers, the broadcast DC is
; added, and packuswb clamps the result back to bytes.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
940
    ; load data
941
    movd       xmm0, [r1]
942
    lea          r1, [r0+r2*2]      ; r1 = address of row 2 (block pointer no longer needed)
943
    pxor       xmm1, xmm1
944
    movq       xmm2, [pw_4]
945

    
946
    ; calculate DC
947
    paddw      xmm0, xmm2
948
    movd       xmm2, [r0]
949
    movd       xmm3, [r0+r2]
950
    movd       xmm4, [r1]
951
    movd       xmm5, [r1+r2]
952
    psraw      xmm0, 3
953
    pshuflw    xmm0, xmm0, 0        ; broadcast DC to the low 4 words
954
    punpcklqdq xmm0, xmm0           ; ... and to all 8 words
955
    punpckldq  xmm2, xmm3           ; rows 0 and 1 side by side
956
    punpckldq  xmm4, xmm5           ; rows 2 and 3 side by side
957
    punpcklbw  xmm2, xmm1           ; byte->word
958
    punpcklbw  xmm4, xmm1
959
    paddw      xmm2, xmm0
960
    paddw      xmm4, xmm0
961
    packuswb   xmm2, xmm4           ; clip and word->bytes, one row per dword
962
    movd       [r0], xmm2
963
    pextrd  [r0+r2], xmm2, 1
964
    pextrd     [r1], xmm2, 2
965
    pextrd  [r1+r2], xmm2, 3
966
    RET
967

    
968
;-----------------------------------------------------------------------------
969
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
970
;-----------------------------------------------------------------------------
971

    
972
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
973
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
974
; %1, %2 = inputs/outputs, %3, %4 = scratch registers (overwritten).
; mul_20091(x) is formed as x + pmulhw(x, 20091); mul_35468(x) is formed as
; pmulhw(2*x, 17734), since 35468 does not fit in a signed word.
%macro VP8_MULTIPLY_SUMSUB 4
975
    mova      %3, %1
976
    mova      %4, %2
977
    pmulhw    %3, m6 ;20091(1)
978
    pmulhw    %4, m6 ;20091(2)
979
    paddw     %3, %1 ; %3 = mul_20091(%1)
980
    paddw     %4, %2 ; %4 = mul_20091(%2)
981
    paddw     %1, %1 ; double the inputs before the 17734 multiply
982
    paddw     %2, %2
983
    pmulhw    %1, m7 ;35468(1)
984
    pmulhw    %2, m7 ;35468(2)
985
    psubw     %1, %4 ; %1 = mul_35468(%1) - mul_20091(%2)
986
    paddw     %2, %3 ; %2 = mul_35468(%2) + mul_20091(%1)
987
%endmacro
988

    
989
; calculate x0=%1+%3; x1=%1-%3
990
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
991
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
992
;           %5/%6 are temporary registers
993
;           we assume m6/m7 have constant words 20091/17734 loaded in them
994
; One 1-D pass of the 4x4 inverse DCT over register numbers %1-%4, with %5/%6
; as scratch register numbers.  SUMSUB_BA and SWAP come from the included
; x86inc/x86util headers; the trailing SWAPs presumably renumber the
; registers so the results end up in %1..%4 order -- confirm against x86inc.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
995
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
996
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
997
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
998
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
999
    SWAP                 %4,  %1
1000
    SWAP                 %4,  %3
1001
%endmacro
1002

    
1003
; Full 4x4 inverse DCT of block[], added to the destination pixels.
; r0 = dst, r1 = block, r2 = stride (per the prototype above).
; Two 1-D passes with a transpose after each; the rounding constant 4 is
; added to m0 before the second pass and the final shift by 3 is passed to
; STORE_DIFFx2 (defined in the included headers, not shown here).
INIT_MMX
1004
cglobal vp8_idct_add_mmx, 3, 3
1005
    ; load block data
1006
    movq         m0, [r1]
1007
    movq         m1, [r1+8]
1008
    movq         m2, [r1+16]
1009
    movq         m3, [r1+24]
1010
    movq         m6, [pw_20091]     ; constants expected by VP8_MULTIPLY_SUMSUB
1011
    movq         m7, [pw_17734]
1012

    
1013
    ; actual IDCT
1014
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1015
    TRANSPOSE4x4W            0, 1, 2, 3, 4
1016
    paddw        m0, [pw_4]         ; rounding for the final >>3
1017
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1018
    TRANSPOSE4x4W            0, 1, 2, 3, 4
1019

    
1020
    ; store
1021
    pxor         m4, m4
1022
    lea          r1, [r0+2*r2]      ; r1 = address of row 2 (block pointer no longer needed)
1023
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
1024
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
1025

    
1026
    RET
1027

    
1028
;-----------------------------------------------------------------------------
1029
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
1030
;-----------------------------------------------------------------------------
1031

    
1032
; SCATTER_WHT n
; writes word n of m0, m1, m2 and m3 to [r0], [r0+32], [r0+64] and [r0+96],
; i.e. to the first word of four consecutive groups of 16 words
; r1 and r2 are used as scratch
%macro SCATTER_WHT 1
    pextrw r1d, m0, %1
    mov [r0+2*16*0], r1w
    pextrw r2d, m1, %1
    mov [r0+2*16*1], r2w
    pextrw r1d, m2, %1
    mov [r0+2*16*2], r1w
    pextrw r2d, m3, %1
    mov [r0+2*16*3], r2w
%endmacro
1042

    
1043
; HADAMARD4_1D a, b, c, d   (register indexes)
; two sum/difference stages over m%1..m%4; the trailing SWAPs emit no
; instructions, they only rename the registers for the caller
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    ; same renaming as the chain form "SWAP %1, %4, %3"
    SWAP %1, %4
    SWAP %4, %3
%endmacro
1048

    
1049
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    ; r0 = block (written), r1 = dc (read); r1/r2 turn into scratch after the loads

    ; load the 4x4 words of dc
    mova          m0, [r1]
    mova          m1, [r1+8]
    mova          m2, [r1+16]
    mova          m3, [r1+24]

    ; first pass, transpose, bias, second pass
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]        ; added ahead of the >>3 below
    HADAMARD4_1D  0, 1, 2, 3

    ; scale all results down by 8
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3

    ; each SCATTER_WHT writes one word from each of m0-m3, 16 words apart;
    ; r0 then moves on by 4*16 words for the next word index
    SCATTER_WHT   0
    add           r0, 2*16*4
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET
1071

    
1072
;-----------------------------------------------------------------------------
1073
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
1074
;-----------------------------------------------------------------------------
1075

    
1076
; READ_8x4_INTERLEAVED: read 8 rows (A-H) of 4 pixels and byte-interleave
; them in pairs; the caller finishes the transpose
;
; called with 7 mm register indexes followed by 4 regular registers
;
; %1-%4   receive A/B, C/D, E/F and G/H interleaved
; %5-%7   scratchspace (one would be sufficient, but this allows for more
;         spreading/pipelining and thus faster execution on OOE CPUs)
; %8/%9   buf+4*stride and buf+5*stride
; %10/%11 -stride and +stride
%macro READ_8x4_INTERLEAVED 11
    ; rows A and B
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    punpcklbw     m%1, m%5          ; A/B interleaved

    ; rows C and D
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    punpcklbw     m%2, m%6          ; C/D interleaved

    ; rows E and F
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    punpcklbw     m%3, m%7          ; E/F interleaved

    ; rows G and H (m%5 is free again, B has been consumed)
    movd          m%4, [%9+%11]     ; G0-3
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro
1099

    
1100
; READ_16x4_INTERLEAVED: read 16 rows (A-P) of 4 pixels and byte-interleave
; them in groups of four rows; the caller finishes the transpose
;
; called with 7 mm register indexes followed by 5 regular registers
; the first 11 arguments mean the same as for READ_8x4_INTERLEAVED above:
; %1-%4   receive A/B/I/J, C/D/K/L, E/F/M/N and G/H/O/P interleaved
; %5-%7   scratchspace
; %8/%9   buf+4*stride and buf+5*stride
; %10/%11 -stride and +stride
; %12     scratchspace used to reach the bottom 8 rows; it starts out as
;         %8+8*stride and is left at %9+8*stride when the macro ends
%macro READ_16x4_INTERLEAVED 12
    ; interleave 16 (A-P) rows of 4 pixels each
    ; base for the bottom half, derived from the macro's own arguments so that
    ; the macro does not depend on the caller having picked r0/r2
    lea           %12, [%8+%11*8]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11          ; %12 now plays the role of %9 for rows J/N/O/P
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%4, [%12+%10*4]  ; J0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%6, [%12]        ; N0-3
    punpcklbw     m%5, m%4          ; B/J
    punpcklbw     m%7, m%6          ; F/N
    punpcklbw     m%1, m%5          ; A/B/I/J interleaved
    punpcklbw     m%3, m%7          ; E/F/M/N interleaved
    movd          m%4, [%9+%11]     ; G0-3
    movd          m%6, [%12+%11]    ; O0-3
    movd          m%5, [%9+%11*2]   ; H0-3
    movd          m%7, [%12+%11*2]  ; P0-3
    punpcklbw     m%4, m%6          ; G/O
    punpcklbw     m%5, m%7          ; H/P
    punpcklbw     m%4, m%5          ; G/H/O/P interleaved
%endmacro
1141

    
1142
; WRITE_4x2D: write 4 mm registers of 2 dwords each, one dword per row
; %1-%4   mm register indexes containing source data (destroyed: the high
;         dword is copied over the low one before the second store)
; %5/%6   buf+4*stride and buf+5*stride
; %7/%8   -stride and +stride
%macro WRITE_4x2D 8
    ; register %1 -> rows 0 and 1
    movd    [%5+%7*4], m%1
    punpckhdq     m%1, m%1
    movd    [%6+%7*4], m%1

    ; register %2 -> rows 2 and 3
    movd    [%5+%7*2], m%2
    punpckhdq     m%2, m%2
    movd      [%5+%7], m%2

    ; register %3 -> rows 4 and 5
    movd         [%5], m%3
    punpckhdq     m%3, m%3
    movd         [%6], m%3

    ; register %4 -> rows 6 and 7
    movd      [%6+%8], m%4
    punpckhdq     m%4, m%4
    movd    [%6+%8*2], m%4
%endmacro
1161

    
1162
; WRITE_4x4D: write 4 xmm registers of 4 dwords each, one dword per row
; arguments are the same as for WRITE_4x2D, but with one extra regular register:
; %1-%4   xmm register indexes containing source data (destroyed by shifting)
; %5-%7   buf+4*stride, buf+5*stride and buf+12*stride
; %8/%9   -stride and +stride
; 1*stride is added to the third regular register (%7) in the process
%macro WRITE_4x4D 9
    ; dwords 0 -> rows 0, 4, 8, 12
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%5+%9*4], m%3
    movd    [%5+%9*8], m%4

    ; dwords 1 -> rows 1, 5, 9, 13
    psrldq        m%1, 4
    movd    [%6+%8*4], m%1
    psrldq        m%2, 4
    movd         [%6], m%2
    psrldq        m%3, 4
    movd    [%6+%9*4], m%3
    psrldq        m%4, 4
    movd    [%6+%9*8], m%4

    ; dwords 2 -> rows 2, 6, 10, 14
    psrldq        m%1, 4
    movd    [%5+%8*2], m%1
    psrldq        m%2, 4
    movd      [%6+%9], m%2
    psrldq        m%3, 4
    movd    [%7+%8*2], m%3
    psrldq        m%4, 4
    movd    [%7+%9*2], m%4

    ; %7 moves from buf+12*stride to buf+13*stride, so that the same two
    ; offsets as above now address rows 11 and 15
    add            %7, %9

    ; dwords 3 -> rows 3, 7, 11, 15
    psrldq        m%1, 4
    movd      [%5+%8], m%1
    psrldq        m%2, 4
    movd    [%6+%9*2], m%2
    psrldq        m%3, 4
    movd    [%7+%8*2], m%3
    psrldq        m%4, 4
    movd    [%7+%9*2], m%4
%endmacro
1204

    
1205
; SIMPLE_LOOPFILTER opt, dir, nregs
;   %1  suffix of the generated function name (mmx, mmxext or sse2); it also
;       selects how "flim" is splatted across the register
;   %2  v: p1/p0/q0/q1 are the rows dst-2*stride .. dst+stride
;       h: p1/p0/q0/q1 are the bytes dst-2 .. dst+1 of each row; the rows are
;          read interleaved, transposed, filtered, transposed back and written
;   %3  number of general-purpose registers requested from cglobal
;
; register use inside the generated function:
;   r0  dst (h: dst+4*stride-2)      r1  stride on entry, then -stride
;   r2  flim on entry, then +stride  r3  pass counter (mmx/mmxext) or pointer to
;   r4  r0+stride (h only)               the bottom 8 rows (sse2, h)
;   r5  saved stack pointer (h only)
;   m7  flim in every byte
;
; mmx/mmxext handle 8 pixels per pass and run two passes; sse2 handles all 16
; pixels in a single pass.
;
; NOTE(review): the sse2 variants use m6/m7 (xmm6/xmm7) but pass no xmm register
; count to cglobal. On Win64 those registers are callee-saved -- confirm whether
; this file's x86inc.asm needs a fourth cglobal argument here.
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
    mov            r5, rsp          ; backup stack pointer
    and           rsp, ~(mmsize-1)  ; align stack
%endif
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2            ; two passes of 8 pixels each
%endif

    ; splat register with "flim"
    movd           m7, r2
    punpcklbw      m7, m7
%if mmsize == 16 ; sse2
    punpcklwd      m7, m7
    pshufd         m7, m7, 0x0
%elifidn %1, mmx
    punpcklwd      m7, m7
    punpckldq      m7, m7
%else ; mmxext
    pshufw         m7, m7, 0x0
%endif

    ; set up indexes to address 4 rows
    mov            r2, r1
    neg            r1
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
    sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4

    ; p1/q1 are not modified by the filter; park them until the store
    mova        [rsp], m0           ; store p1
    mova [rsp+mmsize], m3           ; store q1
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]      ; drop bit 0 so the qword shift cannot leak between bytes
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    ; q0 -= f1: positive and negative f1 are split into two unsigned
    ; magnitudes and applied with saturating subtract/add
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    ; p0 += f2, same technique
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4           ; filtered q0
    mova      [r0+r1], m6           ; filtered p0
%else ; h
    ; rebuild the p1/p0/q0/q1 order in m0-m3 for the transpose
    mova           m0, [rsp]        ; p1
    SWAP            2, 4            ; q0 (filtered, was in m4)
    SWAP            1, 6            ; p0 (filtered, was in m6)
    mova           m3, [rsp+mmsize] ; q1

    TRANSPOSE4x4B  0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    add            r3, r1           ; change from r4+8*stride to r0+8*stride
    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
%else ; mmx/mmxext
    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8]    ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
%endif

%ifidn %2, h
    mov           rsp, r5           ; restore stack pointer
    RET
%elif mmsize == 8 ; mmx/mmxext, v
    REP_RET                         ; the return directly follows the loop branch
%else ; sse2, v
    RET
%endif
%endmacro

INIT_MMX
SIMPLE_LOOPFILTER mmx,    v, 4
SIMPLE_LOOPFILTER mmx,    h, 6
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2,   v, 3
SIMPLE_LOOPFILTER sse2,   h, 6