Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp.asm @ 2dd2f716

History | View | Annotate | Download (28.4 KB)

1
;******************************************************************************
2
;* VP8 MMXEXT optimizations
3
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5
;*
6
;* This file is part of FFmpeg.
7
;*
8
;* FFmpeg is free software; you can redistribute it and/or
9
;* modify it under the terms of the GNU Lesser General Public
10
;* License as published by the Free Software Foundation; either
11
;* version 2.1 of the License, or (at your option) any later version.
12
;*
13
;* FFmpeg is distributed in the hope that it will be useful,
14
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
;* Lesser General Public License for more details.
17
;*
18
;* You should have received a copy of the GNU Lesser General Public
19
;* License along with FFmpeg; if not, write to the Free Software
20
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
;******************************************************************************
22

    
23
%include "x86inc.asm"
24
%include "x86util.asm"
25

    
26
SECTION_RODATA
27

    
28
; 4-tap subpel coefficients as signed words, paired for pmaddwd.
; Each filter occupies two 16-byte rows: the (F0,F1) pair, then the (F2,F3)
; pair. The horizontal filters address them as [table+mx*16-16] and
; [table+mx*16].
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1

                     times 4 dw  -9,  93
                     times 4 dw  50,  -6

                     times 4 dw  -6,  50
                     times 4 dw  93,  -9

                     times 4 dw  -1,  12
                     times 4 dw 123,  -6
36

    
37
; 6-tap subpel coefficients as signed words, paired for pmaddwd.
; Each filter occupies three 16-byte rows: (F0,F1), (F2,F3), (F4,F5).
; The horizontal filters address them at mx*24-48, mx*24-32 and mx*24-16.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1

                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3

                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2
46

    
47
; 4-tap subpel coefficients as signed bytes, paired for pmaddubsw.
; Each filter occupies two 16-byte rows: the outer pair (F0,F3), then the
; inner pair (F1,F2). The SSSE3 code addresses them as [table+m*16-16] and
; [table+m*16], where m is mx or my.
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12

                     times 8 db  -9,  -6
                     times 8 db  93,  50

                     times 8 db  -6,  -9
                     times 8 db  50,  93

                     times 8 db  -1,  -6
                     times 8 db  12, 123
55

    
56
; 6-tap subpel coefficients as signed bytes, paired for pmaddubsw.
; Each filter occupies three 16-byte rows: (F0,F5), (F1,F2), (F3,F4).
; The SSSE3 code addresses them at m*24-48, m*24-32 and m*24-16, where m is
; mx or my.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8

                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16

                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11
65

    
66
; 4-tap subpel coefficients for the vertical pmullw filters: one 16-byte row
; of eight identical words per tap, four rows (F0..F3) per filter.
; FILTER_V points r6 at table+my*32-32 and reads taps at +0/+16/+32/+48.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1

                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6

                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9

                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6
82

    
83
; 6-tap subpel coefficients for the vertical pmullw filters: one 16-byte row
; of eight identical words per tap, six rows (F0..F5) per filter.
; FILTER_V points r6 at table+my*48-96 and reads taps at +0 .. +80.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1

                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3

                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2
101

    
102
; Bilinear weights 1..7 as words, one 16-byte row per weight.
; The word-based bilinear code loads row [table+w*16-16] for weight w, once
; with w = 8-frac and once with w = frac.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7
109

    
110
; Bilinear weight pairs (8-frac, frac) as bytes for pmaddubsw, one 16-byte
; row per frac = 1..7; the SSSE3 code loads row [table+frac*16-16].
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
117

    
118
; Table base aliases used in effective addresses.
; With PIC defined, every alias expands to r11: each function first loads the
; address of the table it needs into r11 with lea. Without PIC the alias is
; the table symbol itself.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw    fourtap_filter_hw_m
%define sixtap_filter_hw     sixtap_filter_hw_m
%define fourtap_filter_hb    fourtap_filter_hb_m
%define sixtap_filter_hb     sixtap_filter_hb_m
%define fourtap_filter_v     fourtap_filter_v_m
%define sixtap_filter_v      sixtap_filter_v_m
%define bilinear_filter_vw   bilinear_filter_vw_m
%define bilinear_filter_vb   bilinear_filter_vb_m
%endif
137

    
138
; pshufb control masks: each gathers, for 8 output pixels, the pair of source
; bytes that one pmaddubsw coefficient pair is applied to.
;   h2_shuf  : bytes (i,   i+1)  - bilinear pair
;   h4_shuf  : bytes (i,   i+3)  - outer taps of the 4-tap filter
;   h6_shuf1 : bytes (i,   i+5)  - outer taps of the 6-tap filter
;   h6_shuf2 : bytes (i+1, i+2)  - also used as the inner pair by the 4-tap code
;   h6_shuf3 : bytes (i+3, i+4)
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5,  8, 6,  9,  7, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
144

    
145
; IDCT multiplier words; vp8_idct_add_mmx loads them into m6/m7 for
; VP8_MULTIPLY_SUMSUB (pmulhw operands).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
147

    
148
cextern pw_3
149
cextern pw_4
150
cextern pw_64
151

    
152
SECTION .text
153

    
154
;-----------------------------------------------------------------------------
155
; subpel MC functions:
156
;
157
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
158
;                                              uint8_t *src, int srcstride,
159
;                                              int height,   int mx, int my);
160
;-----------------------------------------------------------------------------
161

    
162
; 4-pixel-wide block, H-only 4-tap filter
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; Each loop iteration filters one row and stores 4 output pixels.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; mx*16 = byte offset of the F2/F3 row
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6                     ; zero, for byte->word unpacking

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row; height is an int, so
                                           ; only the low 32 bits are tested
    jg .nextrow
    REP_RET
208

    
209
; 4-pixel-wide block, H-only 6-tap filter
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; mm3 doubles as the zero register and as a temporary, so it is re-zeroed
; inside the loop before it is needed for unpacking again.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; mx*3; scaled by 8 below => mx*24
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; mm3 is the zero register again
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row; height is an int, so
                                           ; only the low 32 bits are tested
    jg .nextrow
    REP_RET
265

    
266
; 8-pixel-wide block, H-only 4-tap filter
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; Each row is produced as two halves of 4 pixels (loads at src-1 and src+3).
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                         ; mx*16 = byte offset of the F2/F3 row
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7                        ; zero, for byte->word unpacking

.nextrow:
    ; left 4 output pixels
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5        ; * F0/F1
    pmaddwd   m2, m6        ; * F2/F3
    paddd     m0, m2

    ; right 4 output pixels (same pattern, source shifted by 4 pixels)
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; FGHIJKL
    psrldq    m3, 4         ; GHIJKL
    psrldq    m4, 6         ; HIJKL
    punpcklwd m1, m2        ; EFFGGHHI
    punpcklwd m3, m4        ; GHHIIJJK
    pmaddwd   m1, m5        ; * F0/F1
    pmaddwd   m3, m6        ; * F2/F3
    paddd     m1, m3

    ; merge halves, round/clip/store
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row; height is an int, test 32 bits only
    jg .nextrow
    REP_RET
318

    
319
; 8-pixel-wide block, H-only 6-tap filter
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; r5 is turned into a pointer just past the selected filter, so the three
; coefficient rows are read at [r5-48], [r5-32] and [r5-16].
; One 16-byte load at src-2 (pixels A..P) feeds both 4-pixel halves.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7        ; zero, for byte->word unpacking

.nextrow:
    ; left 4 output pixels
    movu      m0, [r2-2]
    mova      m6, m0        ; byte copy kept for the right half
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2         ; FGHIJKL
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]   ; * F0/F1
    pmaddwd   m2, [r5-32]   ; * F2/F3
    pmaddwd   m4, [r5-16]   ; * F4/F5
    paddd     m0, m2
    paddd     m0, m4

    ; right 4 output pixels (same pattern, source shifted by 4 pixels)
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; FGHIJKL
    psrldq    m2, 4         ; GHIJKL
    psrldq    m3, 6         ; HIJKL
    psrldq    m4, 4
    punpcklbw m4, m7        ; IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2         ; JKLMNOP
    punpcklwd m6, m1        ; EFFGGHHI
    punpcklwd m2, m3        ; GHHIIJJK
    punpcklwd m4, m5        ; IJJKKLLM
    pmaddwd   m6, [r5-48]   ; * F0/F1
    pmaddwd   m2, [r5-32]   ; * F2/F3
    pmaddwd   m4, [r5-16]   ; * F4/F5
    paddd     m6, m2
    paddd     m6, m4

    ; merge halves, round/clip/store
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row; height is an int, test 32 bits only
    jg .nextrow
    REP_RET
385

    
386
; 8-pixel-wide block, H-only 4-tap filter (pshufb + pmaddubsw)
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; m3/m4 gather the (outer, inner) source byte pairs; m5/m6 hold the matching
; (F0,F3) and (F1,F2) coefficient pairs.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4
    mova      m2, [pw_64]
    mova      m3, [filter_h4_shuf]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3        ; bytes (i, i+3)
    pshufb    m1, m4        ; bytes (i+1, i+2)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2        ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row; height is an int, test 32 bits only
    jg .nextrow
    REP_RET
416

    
417
; 8-pixel-wide block, H-only 6-tap filter (pshufb + pmaddubsw)
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; m5/m6/m7 hold the (F0,F5), (F1,F2) and (F3,F4) coefficient pairs; the three
; shuffle masks gather the matching source byte pairs.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]
    mova      m3, [filter_h6_shuf1]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3                    ; bytes (i,   i+5)
    pshufb    m1, m4                    ; bytes (i+1, i+2)
    pshufb    m2, [filter_h6_shuf3]     ; bytes (i+3, i+4)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]               ; rounding
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row; height is an int, test 32 bits only
    jg .nextrow
    REP_RET
451

    
452
; FILTER_V opt, width, nregs
; Emits put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>.
;   %1 = function name suffix, %2 = block width used in the function name,
;   %3 = forwarded as the last cglobal argument.
; The number of pixels handled per row is whatever movh/mova move under the
; INIT_MMX / INIT_XMM mode active at the point of instantiation.
; Arguments of the generated functions follow the prototype in the section
; header: r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height,
; r6 = my (r5 = mx is unused here).
%macro FILTER_V 3
; %2-pixel-wide block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 -> the four tap rows of this filter
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]                  ; F3, kept in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the unscaled new row
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                       ; rows slide up by one: m0 <- m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2                       ; m1 <- m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3                       ; m2 <- new row

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                          ; next row; height is an int, so
                                           ; only the low 32 bits are tested
    jg .nextrow
    REP_RET


; %2-pixel-wide block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 -> the six tap rows of this filter
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; rows slide up by one: m0 <- m1
    mova      m1, m2                       ; m1 <- m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3                       ; m2 <- m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4                       ; m3 <- m4
    mova      m4, m5                       ; m4 <- new row
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                          ; next row; height is an int, so
                                           ; only the low 32 bits are tested
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
575

    
576
; 8-pixel-wide block, V-only 4-tap filter (pmaddubsw)
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
; m5 = (F0,F3) byte pairs, m6 = (F1,F2) byte pairs, m7 = rounding constant.
; m0..m2 carry the three previous source rows between iterations.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; rows slide up by one: m0 <- m1
    punpcklbw m4, m3                       ; interleave oldest row with new row
    punpcklbw m1, m2                       ; interleave the two middle rows
    pmaddubsw m4, m5
    pmaddubsw m1, m6
    paddsw    m4, m1
    mova      m1, m2                       ; m1 <- m2
    paddsw    m4, m7                       ; rounding
    mova      m2, m3                       ; m2 <- new row
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row; height is an int, so
                                           ; only the low 32 bits are tested
    jg .nextrow
    REP_RET
614

    
615
; 8-pixel-wide block, V-only 6-tap filter (pmaddubsw)
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
; r6 becomes a pointer just past the selected filter; its (F0,F5), (F1,F2)
; and (F3,F4) byte-pair rows are read at [r6-48], [r6-32] and [r6-16].
; m0..m4 carry the five previous source rows between iterations.
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; interleave oldest row with new row
    mova      m0, m1                       ; rows slide up by one: m0 <- m1
    punpcklbw m1, m2                       ; interleave rows 1 and 2
    mova      m7, m3
    punpcklbw m7, m4                       ; interleave rows 3 and 4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                       ; m1 <- m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3                       ; m2 <- m3
    psraw     m6, 7
    mova      m3, m4                       ; m3 <- m4
    packuswb  m6, m6
    mova      m4, m5                       ; m4 <- new row
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row; height is an int, so
                                           ; only the low 32 bits are tested
    jg .nextrow
    REP_RET
661

    
662
; FILTER_BILINEAR opt, width, nregs
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
;   %1 = function name suffix (also selects the store sequence),
;   %2 = block width used in the function name,
;   %3 = forwarded as the last cglobal argument.
; Both functions produce two output rows per loop iteration.
; m4 holds the weight (8 - frac), m5 the weight frac, m6 is zero.
; "psraw 2" followed by "pavgw with zero" computes ((sum >> 2) + 1) >> 1.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                        ; my*16
    sub      r5d, r6d                      ; (8-my)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row is used by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                       ; two rows done; height is an int,
                                           ; so only the low 32 bits are tested
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                        ; mx*16
    sub      r6d, r5d                      ; (8-mx)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                       ; two rows done; height is an int,
                                           ; so only the low 32 bits are tested
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
759

    
760
; 8-pixel-wide vertical bilinear filter (pmaddubsw), two rows per iteration
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
; m3 = (8-my, my) byte pairs, m4 = zero.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                       ; rows 0/1 interleaved
    punpcklbw m1, m2                       ; rows 1/2 interleaved
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x + 1) >> 1
    pavgw     m1, m4
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                       ; two rows done; height is an int,
                                           ; so only the low 32 bits are tested
    jg .nextrow
    REP_RET
788

    
789
; 8-pixel-wide horizontal bilinear filter (pshufb + pmaddubsw), two rows per
; iteration
; Arguments follow the prototype in the section header:
;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
; m2 = shuffle gathering bytes (i, i+1), m3 = (8-mx, mx) byte pairs, m4 = zero.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; (x + 1) >> 1
    pavgw     m1, m4
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                       ; two rows done; height is an int,
                                           ; so only the low 32 bits are tested
    jg .nextrow
    REP_RET
817

    
818
; Plain copy of an 8-byte-wide block, two rows per iteration.
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
cglobal put_vp8_pixels8_mmx, 5,5
.copy_rows:
    movq  mm0, [r2+r3*0]        ; row n
    movq  mm1, [r2+r3*1]        ; row n+1
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    lea    r2, [r2+r3*2]        ; advance both pointers by two rows
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .copy_rows
    REP_RET
829

    
830
; Plain copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
; iteration.
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
cglobal put_vp8_pixels16_mmx, 5,5
.copy_rows:
    movq  mm0, [r2+r3*0+0]      ; row n,   left half
    movq  mm1, [r2+r3*0+8]      ; row n,   right half
    movq  mm2, [r2+r3*1+0]      ; row n+1, left half
    movq  mm3, [r2+r3*1+8]      ; row n+1, right half
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    lea    r2, [r2+r3*2]        ; advance both pointers by two rows
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .copy_rows
    REP_RET
845

    
846
; Plain copy of a 16-byte-wide block, two rows per iteration.
; Source rows are read with unaligned loads (movups); destination rows are
; written with aligned stores (movaps).
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
cglobal put_vp8_pixels16_sse, 5,5,2
.copy_rows:
    movups xmm0, [r2+r3*0]      ; row n
    movups xmm1, [r2+r3*1]      ; row n+1
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea     r2, [r2+r3*2]       ; advance both pointers by two rows
    lea     r0, [r0+r1*2]
    sub    r4d, 2
    jg .copy_rows
    REP_RET
857

    
858
;-----------------------------------------------------------------------------
859
; IDCT functions:
860
;
861
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
862
;-----------------------------------------------------------------------------
863

    
864
; DC-only IDCT: adds (block[0] + 4) >> 3 to a 4x4 block of pixels.
;   r0 = dst, r1 = block, r2 = stride
; The signed DC is split into two unsigned byte vectors: mm0 = max(dc, 0) and
; mm1 = max(-dc, 0). A saturating add of mm0 followed by a saturating subtract
; of mm1 then applies the signed DC with clamping to 0..255.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       mm0, [r1]

    ; calculate DC and its negation
    paddw      mm0, [pw_4]
    psraw      mm0, 3
    pxor       mm1, mm1
    psubw      mm1, mm0             ; mm1 = -dc

    ; clamp each to 0..255 and broadcast to the low 4 bytes
    packuswb   mm0, mm0
    punpcklbw  mm0, mm0
    punpcklwd  mm0, mm0
    packuswb   mm1, mm1
    punpcklbw  mm1, mm1
    punpcklwd  mm1, mm1

    ; add DC
    lea         r1, [r0+r2*2]       ; r1 -> third row (block pointer no longer needed)
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]
    paddusb    mm2, mm0
    paddusb    mm3, mm0
    paddusb    mm4, mm0
    paddusb    mm5, mm0
    psubusb    mm2, mm1
    psubusb    mm3, mm1
    psubusb    mm4, mm1
    psubusb    mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
899

    
900
; DC-only IDCT, SSE4 version: adds (block[0] + 4) >> 3 to a 4x4 block of
; pixels.
;   r0 = dst, r1 = block, r2 = stride
; All four rows are widened to words in two registers, the broadcast DC is
; added as a signed word, and packuswb clamps the result back to bytes.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]      ; r1 -> third row (block pointer no longer needed)
    pxor       xmm1, xmm1           ; zero, for byte->word unpacking
    movq       xmm2, [pw_4]

    ; calculate DC, interleaved with the pixel loads
    paddw      xmm0, xmm2
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0        ; broadcast DC to the low 4 words
    punpcklqdq xmm0, xmm0           ; ... and to all 8 words
    punpckldq  xmm2, xmm3           ; rows 0 and 1
    punpckldq  xmm4, xmm5           ; rows 2 and 3
    punpcklbw  xmm2, xmm1
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4           ; clamp; one row per dword

    ; store the four rows
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
928

    
929
;-----------------------------------------------------------------------------
930
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
931
;-----------------------------------------------------------------------------
932

    
933
; SUMSUB a, b, tmp
; On exit: a = a + b, b = b - a (using the value a had on entry).
; tmp is clobbered with the entry value of a.
%macro SUMSUB 3
    mova      %3, %1
    paddw     %1, %2
    psubw     %2, %3
%endmacro
939

    
940
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
; On exit: a = mul_35468(a) - mul_20091(b)
;          b = mul_20091(a) + mul_35468(b)      (inputs are the entry values)
; mul_20091(x) is formed as pmulhw(x, 20091) + x; mul_35468(x) is formed as
; pmulhw(2*x, 17734), since 35468 does not fit a signed word.
; Requires m6 = words of 20091 and m7 = words of 17734; clobbers tmp1/tmp2.
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    psllw     %1, 1
    psllw     %2, 1
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
956

    
957
; VP8_IDCT_TRANSFORM4x4_1D r0, r1, r2, r3, t0, t1   (register numbers)
; One 1-D pass of the 4x4 inverse transform over registers m%1..m%4:
;   x0 = %1 + %3                          x1 = %1 - %3
;   x2 = mul_35468(%2) - mul_20091(%4)    x3 = mul_20091(%2) + mul_35468(%4)
;   %1 = x0 + x3 (tmp0)    %2 = x1 + x2 (tmp1)
;   %3 = x1 - x2 (tmp2)    %4 = x0 - x3 (tmp3)
; m%5/m%6 are temporaries. m6/m7 must hold the constant words 20091/17734.
; SUMSUB_BA is not defined in this file (presumably x86util.asm); the
; trailing SWAPs rename registers so results land in the order listed above.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
970

    
971
; TRANSPOSE4x4 r0, r1, r2, r3, tmp   (register numbers)
; Transposes a 4x4 matrix of words held in m%1..m%4, using m%5 as scratch.
; The punpck sequence leaves the columns in %1/%4/%5/%3; the SWAPs that
; follow rename the registers.
; NOTE(review): the functions visible in this file call TRANSPOSE4x4W, not
; this macro - confirm whether this definition is still used anywhere.
%macro TRANSPOSE4x4 5 ; output in %1/%4/%5/%3
    mova      m%5, m%1
    punpcklwd m%1, m%2
    punpckhwd m%5, m%2
    mova      m%2, m%3
    punpcklwd m%3, m%4
    punpckhwd m%2, m%4
    mova      m%4, m%1
    punpckldq m%1, m%3 ;col0
    punpckhdq m%4, m%3 ;col1
    mova      m%3, m%5
    punpckldq m%5, m%2 ;col2
    punpckhdq m%3, m%2 ;col3
    SWAP       %4,  %2
    SWAP       %4,  %5
    SWAP       %4,  %3
%endmacro
989

    
990
; Full 4x4 inverse transform of `block`, result added to the pixels at dst.
;   r0 = dst, r1 = block (16 words), r2 = stride
; Two 1-D passes with a transpose after each; the rounding constant 4 is
; added to m0 before the second pass.
; TRANSPOSE4x4W and STORE_DIFFx2 are not defined in this file (presumably
; x86util.asm).
INIT_MMX
cglobal vp8_idct_add_mmx, 3, 3
    ; load the four coefficient rows and the multiplier constants
    movq         m0, [r1]
    movq         m1, [r1+8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]

    ; first pass
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; second pass, with rounding
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store: rows 0/1 via r0, rows 2/3 via r1
    pxor         m4, m4
    lea          r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
1014

    
1015
;-----------------------------------------------------------------------------
1016
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
1017
;-----------------------------------------------------------------------------
1018

    
1019
; SCATTER_WHT n
; Extracts word n from each of m0..m3 and stores them 2*16 bytes apart,
; starting at [r0].
; Uses r1/r2 as scratch, so any earlier value in them is lost.
%macro SCATTER_WHT 1
    pextrw r1d, m0, %1
    pextrw r2d, m1, %1
    mov [r0+2*16*0], r1w
    mov [r0+2*16*1], r2w
    pextrw r1d, m2, %1
    pextrw r2d, m3, %1
    mov [r0+2*16*2], r1w
    mov [r0+2*16*3], r2w
%endmacro
1029

    
1030
; HADAMARD4_1D r0, r1, r2, r3   (register numbers)
; One 1-D pass of a 4-point Hadamard transform over m%1..m%4: two rounds of
; paired sum/difference butterflies followed by a register rename.
; SUMSUB_BADC is not defined in this file (presumably x86util.asm).
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro
1035

    
1036
; Inverse Walsh-Hadamard transform of the 16 luma DC values; each result is
; written into the first coefficient of one of the 16 blocks.
;   r0 = block (DCTELEM [4][4][16]), r1 = dc (DCTELEM [16])
; r1 and r2 are reused as scratch by SCATTER_WHT once dc has been loaded.
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    ; load the 4x4 DC matrix
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]

    ; two 1-D passes with a transpose in between; +3 rounds the final >>3
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3

    ; scatter: each step writes 4 results, then advances r0 by 4 blocks
    ; (4 * 16 coefficients * 2 bytes)
    SCATTER_WHT   0
    add           r0, 2*16*4
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET