;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
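
; Rough C model of this predictor (an illustrative sketch, not FFmpeg's C
; reference): the row of 16 pixels directly above the block is replicated
; into all 16 rows.
;
;   static void pred16x16_vertical(uint8_t *src, int stride)
;   {
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++)
;               src[y * stride + x] = src[x - stride]; // pixel above column x
;   }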

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
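
; Rough C model (illustrative sketch, not FFmpeg's C reference): every row
; is filled with its left neighbour, the pixel just left of the block in
; that row.
;
;   static void pred16x16_horizontal(uint8_t *src, int stride)
;   {
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++)
;               src[y * stride + x] = src[y * stride - 1]; // left neighbour
;   }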

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
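
; Rough C model (illustrative sketch): the block is filled with the rounded
; average of the 16 pixels above and the 16 pixels to the left.
;
;   static void pred16x16_dc(uint8_t *src, int stride)
;   {
;       int sum = 0;
;       for (int i = 0; i < 16; i++)
;           sum += src[i - stride] + src[i * stride - 1];
;       int dc = (sum + 16) >> 5;
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++)
;               src[y * stride + x] = dc;
;   }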

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
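
; Rough C model of VP8 TrueMotion prediction (an illustrative sketch; the
; 8x8 tm_vp8 routines later in this file follow the same scheme on a
; smaller block):
;
;   static void pred16x16_tm_vp8(uint8_t *src, int stride)
;   {
;       int tl = src[-1 - stride];                 // top-left neighbour
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++) {
;               int v = src[x - stride] + src[y * stride - 1] - tl;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }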

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
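
; Rough C model of the h264 variant (an illustrative sketch, not FFmpeg's C
; reference); the rv40 and svq3 variants differ only in how the H/V sums
; are scaled (see the %3 branches in the macro below):
;
;   static void pred16x16_plane(uint8_t *src, int stride)
;   {
;       const uint8_t *t = src - stride, *l = src - 1;  // top row, left col
;       int H = 0, V = 0;
;       for (int i = 1; i <= 8; i++) {
;           H += i * (t[7 + i] - t[7 - i]);
;           V += i * (l[(7 + i) * stride] - l[(7 - i) * stride]);
;       }
;       int b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6;
;       int a = 16 * (t[15] + l[15 * stride]);
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++) {
;               int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }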

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

%ifidn %3, h264
    pmullw       m0, [pw_5]
    paddw        m0, [pw_32]
    psraw        m0, 6
%elifidn %3, rv40
    pmullw       m0, [pw_5]
    psraw        m0, 6
%elifidn %3, svq3
    movd        r3d, m0
    movsx        r3, r3w
    test         r3, r3
    lea          r4, [r3+3]
    cmovs        r3, r4
    sar          r3, 2           ; H/4
    lea          r3, [r3*5]      ; 5*(H/4)
    test         r3, r3
    lea          r4, [r3+15]
    cmovs        r3, r4
    sar          r3, 4           ; (5*(H/4))/16
    movd         m0, r3d
%endif

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
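
; Same linear-plane scheme as pred16x16_plane above, on an 8x8 block; here
; the gradients are scaled as b = (17*H + 16) >> 5 (and likewise for V), as
; in the H.264 8x8 chroma plane predictor.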

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
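; Rough C model (illustrative sketch): the left and right 4-pixel halves of
; the top row get separate DC values; psadbw sums each half, and the
; psrlw/pavgw pair realizes the rounded (sum + 2) >> 2 average.
;
;   static void pred8x8_top_dc(uint8_t *src, int stride)
;   {
;       int s0 = 0, s1 = 0;
;       for (int i = 0; i < 4; i++) {
;           s0 += src[i     - stride];
;           s1 += src[i + 4 - stride];
;       }
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++)
;               src[y * stride + x] = ((x < 4 ? s0 : s1) + 2) >> 2;
;   }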
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
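
; Rough C model (illustrative sketch): H.264 chroma-style 8x8 DC, where
; each 4x4 quadrant gets its own DC from the adjacent top/left edges.
;
;   static void pred8x8_dc(uint8_t *src, int stride)
;   {
;       int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
;       for (int i = 0; i < 4; i++) {
;           s0 += src[i - stride];            // top, left half
;           s1 += src[i + 4 - stride];        // top, right half
;           s2 += src[-1 + i * stride];       // left, upper half
;           s3 += src[-1 + (i + 4) * stride]; // left, lower half
;       }
;       int dc[2][2] = { { (s0 + s2 + 4) >> 3, (s1 + 2)      >> 2 },
;                        { (s3 + 2)      >> 2, (s1 + s3 + 4) >> 3 } };
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++)
;               src[y * stride + x] = dc[y >> 2][x >> 2];
;   }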

INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
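
; Rough C model (illustrative sketch): unlike the quadrant-based H.264
; chroma DC above, RV40 uses a single DC, the rounded average of all 8 top
; and all 8 left neighbours.
;
;   static void pred8x8_dc_rv40(uint8_t *src, int stride)
;   {
;       int sum = 0;
;       for (int i = 0; i < 8; i++)
;           sum += src[i - stride] + src[i * stride - 1];
;       int dc = (sum + 8) >> 4;
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++)
;               src[y * stride + x] = dc;
;   }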

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro
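
; Rough scalar model of PRED4x4_LOWPASS (illustrative sketch): pavgb rounds
; up, so subtracting (l^r)&1 turns avg(l,r) into (l+r)>>1, and a second
; pavgb against the centre tap then yields the exact 1-2-1 filter:
;
;   static uint8_t lowpass(uint8_t l, uint8_t c, uint8_t r)
;   {
;       uint8_t lr = ((l + r + 1) >> 1) - ((l ^ r) & 1); // == (l + r) >> 1
;       return (c + lr + 1) >> 1;        // == (l + 2*c + r + 2) >> 2
;   }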

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
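; Rough model of the edge handling shared by the pred8x8l_* routines below
; (an illustrative C sketch, not FFmpeg's reference code): the top edge
; t[0..7] is smoothed with PRED4x4_LOWPASS before use, and the
; .fix_lt_*/.fix_tr_* paths substitute the nearest available pixel when the
; top-left or top-right neighbour block is missing:
;
;   lt     = has_topleft  ? src[-1 - stride] : t[0];
;   tr     = has_topright ? src[ 8 - stride] : t[7];
;   top[i] = lowpass(i > 0 ? t[i-1] : lt, t[i], i < 7 ? t[i+1] : tr);
;   // pred8x8l_top_dc then fills the block with (sum(top) + 4) >> 3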
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0
    paddw    mm7, [pw_4]
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r2]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
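
; Rough C model (illustrative sketch): with t'[0..15] the smoothed top plus
; top-right edge (synthesized from t[7] via .fix_tr_2 when the top-right
; block is unavailable), each pixel is the 1-2-1 lowpass sampled along the
; down-left diagonal:
;
;   pred[y][x] = lowpass(t'[x+y], t'[x+y+1], t'[x+y+2]);  // x + y < 14
;   pred[7][7] = lowpass(t'[14], t'[15], t'[15]);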

%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test         r2, r2 ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4
    jmp .body
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    lea         r1, [r0+r3*2]
    movq       mm1, mm7
    movq       mm7, mm5
    movq       mm5, mm6
    movq       mm2, mm7
    lea         r2, [r1+r3*2]
    PALIGNR    mm2, mm6, 1, mm0
    movq       mm3, mm7
    PALIGNR    mm3, mm6, 7, mm0
    movq       mm4, mm7
    lea         r4, [r2+r3*2]
    psrlq      mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r4+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r0+r3*2], mm0
    psrlq      mm0, 8
    psllq      mm1, 56
    por        mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

1721
;-----------------------------------------------------------------------------
1722
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1723
;-----------------------------------------------------------------------------
1724

    
1725
INIT_MMX
1726
%define PALIGNR PALIGNR_MMX
1727
cglobal pred8x8l_vertical_right_mmxext, 4,5
1728
    sub          r0, r3
1729
    lea          r4, [r0+r3*2]
1730
    movq        mm0, [r0+r3*1-8]
1731
    punpckhbw   mm0, [r0+r3*0-8]
1732
    movq        mm1, [r4+r3*1-8]
1733
    punpckhbw   mm1, [r0+r3*2-8]
1734
    mov          r4, r0
1735
    punpckhwd   mm1, mm0
1736
    lea          r0, [r0+r3*4]
1737
    movq        mm2, [r0+r3*1-8]
1738
    punpckhbw   mm2, [r0+r3*0-8]
1739
    lea          r0, [r0+r3*2]
1740
    movq        mm3, [r0+r3*1-8]
1741
    punpckhbw   mm3, [r0+r3*0-8]
1742
    punpckhwd   mm3, mm2
1743
    punpckhdq   mm3, mm1
1744
    lea          r0, [r0+r3*2]
1745
    movq        mm0, [r0+r3*0-8]
1746
    movq        mm1, [r4]
1747
    mov          r0, r4
1748
    movq        mm4, mm3
1749
    movq        mm2, mm3
1750
    PALIGNR     mm4, mm0, 7, mm0
1751
    PALIGNR     mm1, mm2, 1, mm2
1752
    test        r1, r1
1753
    jz .fix_lt_1
1754
    jmp .do_left
1755
.fix_lt_1:
1756
    movq        mm5, mm3
1757
    pxor        mm5, mm4
1758
    psrlq       mm5, 56
1759
    psllq       mm5, 48
1760
    pxor        mm1, mm5
1761
    jmp .do_left
1762
.fix_lt_2:
1763
    movq        mm5, mm3
1764
    pxor        mm5, mm2
1765
    psllq       mm5, 56
1766
    psrlq       mm5, 56
1767
    pxor        mm2, mm5
1768
    test         r2, r2
1769
    jnz .do_top
1770
.fix_tr_1:
1771
    movq        mm5, mm3
1772
    pxor        mm5, mm1
1773
    psrlq       mm5, 56
1774
    psllq       mm5, 56
1775
    pxor        mm1, mm5
1776
    jmp .do_top
1777
.do_left:
1778
    movq        mm0, mm4
1779
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1780
    movq        mm7, mm2
1781
    movq        mm0, [r0-8]
1782
    movq        mm3, [r0]
1783
    movq        mm1, [r0+8]
1784
    movq        mm2, mm3
1785
    movq        mm4, mm3
1786
    PALIGNR     mm2, mm0, 7, mm0
1787
    PALIGNR     mm1, mm4, 1, mm4
1788
    test         r1, r1
1789
    jz .fix_lt_2
1790
    test         r2, r2
1791
    jz .fix_tr_1
1792
.do_top
1793
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1794
    lea         r1, [r0+r3*2]
1795
    movq       mm2, mm6
1796
    movq       mm3, mm6
1797
    PALIGNR    mm3, mm7, 7, mm0
1798
    PALIGNR    mm6, mm7, 6, mm1
1799
    movq       mm4, mm3
1800
    pavgb      mm3, mm2
1801
    lea         r2, [r1+r3*2]
1802
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1803
    movq [r0+r3*1], mm3
1804
    movq [r0+r3*2], mm0
1805
    movq       mm5, mm0
1806
    movq       mm6, mm3
1807
    movq       mm1, mm7
1808
    movq       mm2, mm1
1809
    psllq      mm2, 8
1810
    movq       mm3, mm1
1811
    psllq      mm3, 16
1812
    lea         r4, [r2+r3*2]
1813
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1814
    PALIGNR    mm6, mm0, 7, mm2
1815
    movq [r1+r3*1], mm6
1816
    psllq      mm0, 8
1817
    PALIGNR    mm5, mm0, 7, mm1
1818
    movq [r1+r3*2], mm5
1819
    psllq      mm0, 8
1820
    PALIGNR    mm6, mm0, 7, mm2
1821
    movq [r2+r3*1], mm6
1822
    psllq      mm0, 8
1823
    PALIGNR    mm5, mm0, 7, mm1
1824
    movq [r2+r3*2], mm5
1825
    psllq      mm0, 8
1826
    PALIGNR    mm6, mm0, 7, mm2
1827
    movq [r4+r3*1], mm6
1828
    psllq      mm0, 8
1829
    PALIGNR    mm5, mm0, 7, mm1
1830
    movq [r4+r3*2], mm5
1831
    RET
1832

    
1833
%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea           r1, [r0+r3*2]
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4
    movhlps     xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro

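; The macro is instantiated twice: with PALIGNR_MMX (shift-and-por
; emulation) for the SSE2 build, and with the native palignr for the
; SSSE3 build.  INIT_MMX is in effect for the mm-register edge
; gathering; the macro body switches to INIT_XMM once the data has
; moved into xmm registers.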
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

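; Vertical-left (8x8 mode 7) needs only the filtered top and top-right
; edges: one row of pavgb pairs and one row of 3-tap lowpass results,
; with each subsequent pair of rows reading one sample further right
; along the edge (the psrldq by 1 between the stores below).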
%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1
    pslldq    xmm1, 1
    pavgb     xmm3, xmm2
    lea         r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

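; Horizontal-up (8x8 mode 8) interpolates down the left column: output
; pairs are (l[y] + l[y+1] + 1) >> 1 and (l[y] + 2*l[y+1] + l[y+2] + 2) >> 2,
; and once the interpolation runs past l7 the prediction saturates to l7
; (the punpckhbw mm7, mm7 replication below).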
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r2]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
.do_left:
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

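; Horizontal-down (8x8 mode 6) is the transpose-like counterpart of
; vertical-right: rows are built from the filtered left edge plus the
; top-left corner, each row sliding two bytes along the diagonal
; relative to the previous one (the PALIGNR ..., 2/4/6 stores below).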
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq       mm5, mm4
    lea         r1, [r0+r3*2]
    psllq      mm7, 56
    movq       mm2, mm5
    movq       mm3, mm6
    movq       mm4, mm2
    PALIGNR    mm2, mm6, 7, mm5
    PALIGNR    mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR    mm4, mm3, 1, mm7
    movq       mm5, mm3
    pavgb      mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq       mm4, mm2
    movq       mm1, mm2
    lea         r4, [r2+r3*2]
    psrlq      mm4, 16
    psrlq      mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq       mm7, mm3
    punpcklbw  mm3, mm0
    punpckhbw  mm7, mm0
    movq       mm1, mm7
    movq       mm0, mm7
    movq       mm4, mm7
    movq [r4+r3*2], mm3
    PALIGNR    mm7, mm3, 2, mm5
    movq [r4+r3*1], mm7
    PALIGNR    mm1, mm3, 4, mm5
    movq [r2+r3*2], mm1
    PALIGNR    mm0, mm3, 6, mm3
    movq [r2+r3*1], mm0
    movq       mm2, mm6
    movq       mm3, mm6
    movq [r1+r3*2], mm4
    PALIGNR    mm6, mm4, 2, mm5
    movq [r1+r3*1], mm6
    PALIGNR    mm2, mm4, 4, mm5
    movq [r0+r3*2], mm2
    PALIGNR    mm3, mm4, 6, mm4
    movq [r0+r3*1], mm3
    RET

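; The SSE2/SSSE3 variant below packs the filtered left edge into the low
; half and the filtered top edge into the high half of an xmm register
; (movq2dq + pslldq + por), so the three shifted source vectors for the
; avg/lowpass pair are each a single 16-byte PALIGNR, and the eight
; output rows fall out of psrldq on the interleaved result.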
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8
    por        xmm0, xmm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm1, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5
INIT_XMM
    lea         r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps   xmm0, xmm4
    movq   [r0+r3*2], xmm4
    movq   [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3*1], xmm4
    movq   [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4
    movq   [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*1], xmm4
    movq   [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

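; 4x4 DC prediction: dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3.  psadbw
; against zero sums the four top bytes in one step, the left column is
; accumulated with scalar movzx loads, and imul by 0x01010101 broadcasts
; the resulting byte to all four lanes of the dword stored to each row.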
cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

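; VP8 TrueMotion: pred[y][x] = clip(left[y] + top[x] - topleft).  The top
; row is widened to words once; per iteration, left[y] - topleft is
; computed in a GPR, broadcast across a register, added to the top row,
; and packuswb performs the clip to [0,255].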
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

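; The SSSE3 version handles all four rows without a loop: pshufb with
; tm_shuf (0x03, 0x80 repeated) splats byte 3 of each 4-byte left-edge
; load into the low byte of every word lane and zeroes the high bytes,
; so topleft and left[y] arrive already widened for the word arithmetic.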
cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

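; VP8's 4x4 vertical mode smooths the top row before replicating it:
; each output byte is (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2, using the
; top-left and top-right neighbours, and the same filtered row is
; written to all four lines.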
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
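; Down-left (4x4 mode 3): pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[x+y+2]
; + 2) >> 2, with t4..t7 loaded from the top-right block.  The
; psllq/pxor/psrlq sequence below builds the shifted row with the last
; sample duplicated, so the filter never reads past t7.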
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

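; Vertical-left (4x4 mode 7): rows 0 and 2 are pavgb of adjacent top
; samples, rows 1 and 3 the 3-tap lowpass, with rows 2 and 3 reading one
; sample further right (the psrlq by 8 before the second pair of stores).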
INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8
    psrlq     m2, 16
    movq      m4, m3
    pavgb     m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET
%endif