Statistics
| Branch: | Revision:

ffmpeg / libavcodec / x86 / vp8dsp.asm @ 004cda8e

History | View | Annotate | Download (25.8 KB)

1
;******************************************************************************
2
;* VP8 MMXEXT optimizations
3
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5
;*
6
;* This file is part of FFmpeg.
7
;*
8
;* FFmpeg is free software; you can redistribute it and/or
9
;* modify it under the terms of the GNU Lesser General Public
10
;* License as published by the Free Software Foundation; either
11
;* version 2.1 of the License, or (at your option) any later version.
12
;*
13
;* FFmpeg is distributed in the hope that it will be useful,
14
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
;* Lesser General Public License for more details.
17
;*
18
;* You should have received a copy of the GNU Lesser General Public
19
;* License along with FFmpeg; if not, write to the Free Software
20
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
;******************************************************************************
22

    
23
%include "x86inc.asm"
24
%include "x86util.asm"
25

    
26
SECTION_RODATA

; Horizontal filter coefficients stored as 16-bit word pairs, each pair
; repeated 4 times so one 16-byte row can be fed straight to pmaddwd.
;
; 4-tap: two rows per filter, (F0,F1) then (F2,F3). The word-based H 4-tap
; functions in this file read the rows at byte offsets mx*16-16 and mx*16.
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap: three rows per filter, (F0,F1), (F2,F3), (F4,F5). The word-based
; H 6-tap functions read the rows at byte offsets mx*24-48, -32 and -16.
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2
46

    
47
; Filter coefficients stored as signed byte pairs for pmaddubsw (SSSE3),
; each pair repeated 8 times to fill a 16-byte row.
;
; 4-tap: two rows per filter, paired as (F0,F3) and (F1,F2). This matches the
; pixel pairing produced by filter_h4_shuf / filter_h6_shuf2 in the H filter,
; and the (row0,row3) / (row1,row2) interleave used by the SSSE3 V filter.
fourtap_filter_hb_m: times 8 db  -6,  -1
                     times 8 db 123,  12
                     times 8 db  -9,  -6
                     times 8 db  93,  50
                     times 8 db  -6,  -9
                     times 8 db  50,  93
                     times 8 db  -1,  -6
                     times 8 db  12, 123

; 6-tap: three rows per filter, paired as (F0,F5), (F1,F2) and (F3,F4),
; matching filter_h6_shuf1 / filter_h6_shuf2 / filter_h6_shuf3.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11
65

    
66
; Vertical filter coefficients for the pmullw-based V filters: one 16-byte
; row per tap, holding that single coefficient broadcast to 8 words.
;
; 4-tap: four rows (F0..F3) per filter, i.e. 64 bytes per filter.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap: six rows (F0..F5) per filter, i.e. 96 bytes per filter.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2
101

    
102
; Bilinear weights as words: row (w-1) holds the weight w (1..7) broadcast to
; 8 words. The word-based bilinear functions fetch both w and 8-w from here.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs for pmaddubsw: row (w-1) holds (8-w, w)
; repeated 8 times.
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
117

    
118
; Table base aliases used in address expressions throughout this file.
; With PIC, an indexed access cannot name the table symbol directly, so every
; alias resolves to r11 and each function first does "lea r11, [<table>_m]"
; for the one table it needs. Without PIC the alias is the table symbol itself.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif
137

    
138
; pshufb masks that turn 16 loaded source bytes into 8 byte pairs, one pair
; per output pixel, ready for pmaddubsw against the *_hb_m / *_vb_m tables.
;
; h2: pairs (p[i], p[i+1])   - bilinear
; h4: pairs (p[i], p[i+3])   - outer taps of the 4-tap filter
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5,  8, 6,  9,  7, 10

; h6 shuf1: pairs (p[i],   p[i+5]) - taps F0/F5
; h6 shuf2: pairs (p[i+1], p[i+2]) - taps F1/F2 (also the inner 4-tap pair)
; h6 shuf3: pairs (p[i+3], p[i+4]) - taps F3/F4
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
144

    
145
cextern pw_3
146
cextern pw_4
147
cextern pw_64
148

    
149
SECTION .text
150

    
151
;-----------------------------------------------------------------------------
152
; subpel MC functions:
153
;
154
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
155
;                                              uint8_t *src, int srcstride,
156
;                                              int height,   int mx, int my);
157
;-----------------------------------------------------------------------------
158

    
159
; 4-pixel-wide block, H-only 4-tap filter
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (selects the filter)
; Per row, 8 bytes are read starting at src-1 and 4 output pixels are written:
;   (F0*p[-1] + F1*p[0] + F2*p[1] + F3*p[2] + pw_64) >> 7, saturated to a byte.
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4                       ; mx -> byte offset of 16-byte table rows
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6                     ; zero, for byte->word unpacking

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
205

    
206
; 4-pixel-wide block, H-only 6-tap filter
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (selects the filter)
; Per row, source bytes from src-2 up to src+6 are read and 4 output pixels
; are written: (sum of F0..F5 * p[-2..3] + pw_64) >> 7, saturated to a byte.
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]                  ; mx*3; scaled by 8 below -> mx*24
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3                     ; zero, for byte->word unpacking

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE (mm3 is no longer zero)
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; re-zero mm3 for unpack/pack below
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
262

    
263
; 8-pixel-wide block, H-only 4-tap filter
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (selects the filter)
; Each row is processed as two halves of 4 output pixels; the halves load
; 8 source bytes at src-1 and src+3 respectively.
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl      r5d, 4                 ; mx -> byte offset of 16-byte table rows
%ifdef PIC
    lea      r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7                ; zero, for byte->word unpacking

.nextrow:
    ; left 4 output pixels
    movh      m0, [r2-1]
    punpcklbw m0, m7        ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    pmaddwd   m0, m5        ; F0/F1 products
    pmaddwd   m2, m6        ; F2/F3 products
    paddd     m0, m2

    ; right 4 output pixels (same layout, source 4 bytes further right)
    movh      m1, [r2+3]
    punpcklbw m1, m7        ; ABCDEFGH
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2         ; BCDEFGH
    psrldq    m3, 4         ; CDEFGH
    psrldq    m4, 6         ; DEFGH
    punpcklwd m1, m2        ; ABBCCDDE
    punpcklwd m3, m4        ; CDDEEFFG
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    ; merge both halves, round/clip/store
    packssdw  m0, m1
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
315

    
316
; 8-pixel-wide block, H-only 6-tap filter
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (selects the filter)
; One unaligned 16-byte load at src-2 feeds both 4-pixel halves of the row.
; After setup r5 points just past the selected filter's three table rows,
; so the rows are addressed as [r5-48], [r5-32], [r5-16].
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]            ; mx*3; scaled by 8 below -> mx*24
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7                ; zero, for byte->word unpacking

.nextrow:
    ; left 4 output pixels
    movu      m0, [r2-2]
    mova      m6, m0        ; keep the raw bytes for the right half
    mova      m4, m0
    punpcklbw m0, m7        ; ABCDEFGHI
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m0, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m0, [r5-48]   ; F0/F1 products
    pmaddwd   m2, [r5-32]   ; F2/F3 products
    pmaddwd   m4, [r5-16]   ; F4/F5 products
    paddd     m0, m2
    paddd     m0, m4

    ; right 4 output pixels: same steps on the bytes shifted down by 4
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7        ; ABCDEFGHI
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2         ; BCDEFGH
    psrldq    m2, 4         ; CDEFGH
    psrldq    m3, 6         ; DEFGH
    psrldq    m4, 4
    punpcklbw m4, m7        ; EFGH
    mova      m5, m4
    psrldq    m5, 2         ; FGH
    punpcklwd m6, m1        ; ABBCCDDE
    punpcklwd m2, m3        ; CDDEEFFG
    punpcklwd m4, m5        ; EFFGGHHI
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    ; merge both halves, round/clip/store
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
382

    
383
; 8-pixel-wide block, H-only 4-tap filter (pshufb/pmaddubsw version)
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (selects the filter)
; The source bytes are shuffled into (p[i],p[i+3]) and (p[i+1],p[i+2]) pairs
; and multiplied with the matching (F0,F3) / (F1,F2) byte-pair table rows.
cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
    shl      r5d, 4                 ; mx -> byte offset of 16-byte table rows
    mova      m2, [pw_64]
    mova      m3, [filter_h4_shuf]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow:
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3        ; outer pixel pairs
    pshufb    m1, m4        ; inner pixel pairs
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2        ; rounding
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
413

    
414
; 8-pixel-wide block, H-only 6-tap filter (pshufb/pmaddubsw version)
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (selects the filter)
; The three shuffle masks pair source bytes as (p[i],p[i+5]), (p[i+1],p[i+2])
; and (p[i+3],p[i+4]) to match the (F0,F5), (F1,F2), (F3,F4) table rows.
cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
    lea      r5d, [r5*3]            ; mx*3; scaled by 8 below -> mx*24
    mova      m3, [filter_h6_shuf1]
    mova      m4, [filter_h6_shuf2]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow:
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pshufb    m2, [filter_h6_shuf3] ; all 8 registers are in use, so from memory
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]   ; rounding
    psraw     m0, 7
    packuswb  m0, m0        ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d           ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
448

    
449
; FILTER_V opt, width, nregs
;   %1 = function name suffix (instruction set), %2 = block width used in the
;   function name, %3 = third cglobal argument (vector register count).
; Emits put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>.
; Register width follows the INIT_MMX / INIT_XMM in effect at expansion.
;
; Both functions take: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4d = height in rows, r6d = my (selects the filter). They keep the previous
; source rows unpacked to words in registers and load one new row per
; iteration.
%macro FILTER_V 3
; V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; my*32; with the -32 below -> 64 bytes/filter
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 -> F0 row of the selected filter
    mova      m6, [pw_64]
    pxor      m7, m7                       ; zero, for byte->word unpacking
    mova      m5, [r6+48]                  ; F3

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the new row for the next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                       ; rotate row history while multiplying
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                          ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET


; V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; my*48; with the -96 below -> 96 bytes/filter
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 -> F0 row of the selected filter
    pxor      m7, m7                       ; zero, for byte->word unpacking

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; rotate row history while multiplying
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                          ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
572

    
573
; 8-pixel-wide block, V-only 4-tap filter (pmaddubsw version)
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r6d = my (selects the filter)
; Rows are kept as bytes; each iteration interleaves (row0,row3) and
; (row1,row2) to match the (F0,F3) / (F1,F2) byte-pair table rows.
cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
    shl      r6d, 4                        ; my -> byte offset of 16-byte table rows
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; rotate row history
    punpcklbw m4, m3                       ; (row0,row3) byte pairs
    punpcklbw m1, m2                       ; (row1,row2) byte pairs
    pmaddubsw m4, m5
    pmaddubsw m1, m6
    paddsw    m4, m1
    mova      m1, m2
    paddsw    m4, m7                       ; rounding
    mova      m2, m3
    psraw     m4, 7
    packuswb  m4, m4                       ; clip and word->bytes
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
611

    
612
; 8-pixel-wide block, V-only 6-tap filter (pmaddubsw version)
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r6d = my (selects the filter)
; Rows are kept as bytes; each iteration interleaves (row0,row5), (row1,row2)
; and (row3,row4) to match the (F0,F5), (F1,F2), (F3,F4) table rows, which
; are addressed as [r6-48], [r6-32], [r6-16].
cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
    lea      r6d, [r6*3]                   ; my*3; scaled by 8 below -> my*24
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; (row0,row5) byte pairs
    mova      m0, m1                       ; rotate row history
    punpcklbw m1, m2                       ; (row1,row2) byte pairs
    mova      m7, m3
    punpcklbw m7, m4                       ; (row3,row4) byte pairs
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]                  ; rounding
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6                       ; clip and word->bytes
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec        r4d                         ; next row (height is a 32-bit int)
    jg .nextrow
    REP_RET
658

    
659
; FILTER_BILINEAR opt, width, nregs
;   %1 = function name suffix (instruction set), %2 = block width used in the
;   function name, %3 = third cglobal argument (vector register count).
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
;
; Both take: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4d = height in rows, r5d = mx, r6d = my, and produce two rows per
; iteration. With weight w (my for _v, mx for _h) and neighbouring pixels a, b:
;   out = ((a*(8-w) + b*w) >> 2, then pavgw with zero), i.e. (sum + 4) >> 3.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d               ; r5 = (8-my)*16, r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                ; zero, for unpacking and pavgw rounding
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                ; middle row is shared by both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                ; (x + 1) >> 1
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                ; both rows in one register: low/high halves
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                 ; two rows done (height is a 32-bit int)
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d               ; r6 = (8-mx)*16, r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                ; zero, for unpacking and pavgw rounding
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                ; (x + 1) >> 1
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                ; both rows in one register: low/high halves
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                 ; two rows done (height is a 32-bit int)
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
756

    
757
; 8-pixel-wide vertical bilinear filter (pmaddubsw version), two rows per
; iteration.
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r6d = my (weight of the lower row)
; out = (upper*(8-my) + lower*my + 4) >> 3, done as >>2 followed by pavgw
; with zero.
cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
    shl      r6d, 4                 ; my -> byte offset of 16-byte table rows
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                ; zero, for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16] ; (8-my, my) byte pairs
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                ; (row0,row1) byte pairs
    punpcklbw m1, m2                ; (row1,row2) byte pairs
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                ; (x + 1) >> 1
    pavgw     m1, m4
    packuswb  m0, m1                ; both rows in one register: low/high halves
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                 ; two rows done (height is a 32-bit int)
    jg .nextrow
    REP_RET
785

    
786
; 8-pixel-wide horizontal bilinear filter (pshufb/pmaddubsw version), two
; rows per iteration.
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows, r5d = mx (weight of the right-hand pixel)
; Each row is fetched with a 16-byte unaligned load and shuffled into
; (p[i],p[i+1]) pairs; out = (p[i]*(8-mx) + p[i+1]*mx + 4) >> 3.
cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
    shl      r5d, 4                 ; mx -> byte offset of 16-byte table rows
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                ; zero, for pavgw rounding
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16] ; (8-mx, mx) byte pairs
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                ; (x + 1) >> 1
    pavgw     m1, m4
    packuswb  m0, m1                ; both rows in one register: low/high halves
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2                 ; two rows done (height is a 32-bit int)
    jg .nextrow
    REP_RET
814

    
815
; Plain copy of an 8-pixel-wide block, two rows per iteration.
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    lea    r0, [r0+r1*2]
    sub   r4d, 2            ; two rows done
    jg .nextrow
    REP_RET
826

    
827
; Plain copy of a 16-pixel-wide block using 8-byte MMX moves, two rows
; (four quadwords) per iteration.
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    lea    r0, [r0+r1*2]
    sub   r4d, 2            ; two rows done
    jg .nextrow
    REP_RET
842

    
843
; Plain copy of a 16-pixel-wide block using 16-byte SSE moves, two rows per
; iteration.
;
; In:  r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;      r4d = height in rows
; Loads are unaligned (movups); stores use movaps, so dst rows must be
; 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    lea     r2, [r2+r3*2]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea     r0, [r0+r1*2]
    sub    r4d, 2           ; two rows done
    jg .nextrow
    REP_RET
854

    
855
;-----------------------------------------------------------------------------
856
; IDCT functions:
857
;
858
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
859
;-----------------------------------------------------------------------------
860

    
861
; Adds the rounded DC value (block[0] + pw_4) >> 3 to a 4x4 pixel block.
;
; In:  r0 = dst, r1 = block, r2 = stride
; The DC and its negation are each saturated to unsigned bytes and broadcast,
; so a positive DC is applied by paddusb and a negative one by psubusb (the
; other operand is then zero); both operations saturate to 0..255.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       mm0, [r1]

    ; calculate DC
    paddw      mm0, [pw_4]
    pxor       mm1, mm1
    psraw      mm0, 3
    psubw      mm1, mm0                    ; mm1 = -dc
    packuswb   mm0, mm0                    ; max(dc, 0) as a byte
    packuswb   mm1, mm1                    ; max(-dc, 0) as a byte
    punpcklbw  mm0, mm0                    ; broadcast byte 0 across the low dword
    punpcklbw  mm1, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1

    ; add DC
    lea         r1, [r0+r2*2]              ; r1 = address of row 2 (block ptr no longer needed)
    movd       mm2, [r0]
    movd       mm3, [r0+r2]
    movd       mm4, [r1]
    movd       mm5, [r1+r2]
    paddusb    mm2, mm0
    paddusb    mm3, mm0
    paddusb    mm4, mm0
    paddusb    mm5, mm0
    psubusb    mm2, mm1
    psubusb    mm3, mm1
    psubusb    mm4, mm1
    psubusb    mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
896

    
897
; Adds the rounded DC value (block[0] + pw_4) >> 3 to a 4x4 pixel block.
;
; In:  r0 = dst, r1 = block, r2 = stride
; All 16 pixels are widened to words in two registers, the broadcast DC is
; added with plain word arithmetic, and packuswb saturates back to bytes.
; The four rows are written back with movd / pextrd (SSE4.1).
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]             ; r1 = address of row 2 (block ptr no longer needed)
    pxor       xmm1, xmm1                  ; zero, for byte->word unpacking
    movq       xmm2, [pw_4]

    ; calculate DC
    paddw      xmm0, xmm2
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0               ; broadcast DC to the low 4 words
    punpcklqdq xmm0, xmm0                  ; ... and to all 8 words
    punpckldq  xmm2, xmm3                  ; rows 0,1
    punpckldq  xmm4, xmm5                  ; rows 2,3
    punpcklbw  xmm2, xmm1
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4                  ; clip; one row per dword
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
925

    
926
;-----------------------------------------------------------------------------
927
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
928
;-----------------------------------------------------------------------------
929

    
930
; SCATTER_WHT idx
;   Extracts word %1 from each of m0..m3 and stores the four values 2*16
;   bytes apart starting at [r0].
;   Clobbers r1 and r2.
%macro SCATTER_WHT 1
    pextrw r1d, m0, %1
    pextrw r2d, m1, %1
    mov [r0+2*16*0], r1w
    mov [r0+2*16*1], r2w
    pextrw r1d, m2, %1
    pextrw r2d, m3, %1
    mov [r0+2*16*2], r1w
    mov [r0+2*16*3], r2w
%endmacro
940

    
941
; HADAMARD4_1D a, b, c, d
;   4-point 1-D transform over registers m%1..m%4, built from two
;   SUMSUB_BADC butterfly stages (macro provided by the included utility
;   file). The trailing SWAP renames registers so results end up in the
;   expected order without any data moves.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro
946

    
947
; Inverse Walsh-Hadamard transform of the 16 luma DC coefficients.
;
; In:  r0 = block (output), r1 = dc (16 input words); r2 is used as scratch.
; Two 1-D passes with a transpose in between; pw_3 is added before the
; second pass and every result is shifted right by 3. Each result is then
; written 2*16 bytes after the previous one, i.e. into the first word of
; consecutive 16-coefficient blocks.
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]               ; rounding, carried through the second pass
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0                        ; r1 is free to clobber: dc already loaded
    add           r0, 2*16*4               ; advance by four blocks
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET