;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_5
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
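; Fills each of the 16 rows with the 16 pixels immediately above the block.
; Roughly, in C terms: for (y = 0; y < 16; y++) memcpy(src + y*stride, src - stride, 16);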

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
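; Horizontal prediction: each of the 16 rows is filled with its own left
; neighbour, src[y*stride - 1], broadcast across the row.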

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
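; DC prediction: dc = (sum of the 16 top + 16 left neighbours + 16) >> 5,
; then the whole 16x16 block is filled with that value.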

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
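; VP8 TrueMotion: pred[y][x] = clip(top[x] + left[y] - topleft). The top row is
; kept in words; per row, (left[y] - topleft) is broadcast and added, and
; packuswb does the clipping. A rough C sketch (not the exact reference code):
;   int tl = src[-stride-1];
;   for (y = 0; y < 16; y++)
;       for (x = 0; x < 16; x++)
;           src[y*stride+x] = av_clip_uint8(src[x-stride] + src[y*stride-1] - tl);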

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
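; Plane (planar) prediction: a linear gradient a + b*x + c*y, fitted from the
; top row and left column, is written out with clipping to 8 bits. The h264,
; rv40 and svq3 variants of this macro differ only in how the H and V sums are
; scaled and rounded (see the %ifidn %3 branches below).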

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

%ifidn %3, h264
    pmullw       m0, [pw_5]
    paddw        m0, [pw_32]
    psraw        m0, 6
%elifidn %3, rv40
    pmullw       m0, [pw_5]
    psraw        m0, 6
%elifidn %3, svq3
    movd         r3, m0
    movsx        r3, r3w
    test         r3, r3
    lea          r4, [r3+3]
    cmovs        r3, r4
    sar          r3, 2           ; H/4
    lea          r3, [r3*5]      ; 5*(H/4)
    test         r3, r3
    lea          r4, [r3+15]
    cmovs        r3, r4
    sar          r3, 4           ; (5*(H/4))/16
    movd         m0, r3d
%endif

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
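; Same plane prediction on an 8x8 (chroma) block; here the H/V sums are scaled
; by 17 and rounded with 16 before the >>5, instead of the 5/32/>>6 scaling
; used by the 16x16 h264 variant above.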

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
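; Copies the 8 pixels directly above the block into each of the 8 rows.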

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
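; Each of the 8 rows is filled with its left neighbour pixel, as in the 16x16
; horizontal prediction above.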

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
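; RV40 8x8 DC: dc = (sum of the 8 top + 8 left neighbours + 8) >> 4, broadcast
; over the whole block.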

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
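; VP8 TrueMotion on an 8x8 block: pred[y][x] = clip(top[x] + left[y] - topleft),
; same scheme as the 16x16 version above.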

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
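; 4x4 DC: dc = (sum of the 4 top + 4 left neighbours + 4) >> 3; the byte is
; replicated into a dword with imul 0x01010101 and stored to the 4 rows.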

cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
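; VP8 TrueMotion on a 4x4 block, same clip(top[x] + left[y] - topleft) scheme
; as the 8x8 and 16x16 versions above.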

%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
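; Uses the pavgb rounding trick: pavgb rounds up, so the low bit (%2 ^ %3) & 1
; is subtracted first to get a floor average of the two outer taps before the
; final average with the centre tap.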
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
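; VP8 4x4 vertical prediction: the row above is smoothed with the 3-tap
; PRED4x4_LOWPASS filter (using the top-left and top-right neighbours) and the
; filtered 4 pixels are written to all 4 rows.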

INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET