ffmpeg / libavcodec / x86 / h264_intrapred.asm @ 1b3e43e4
;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
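; What the routines below compute, as an illustrative C sketch (our own
; sketch, not FFmpeg's C reference; the _c name is hypothetical): the row
; of 16 pixels above the block is copied into every row of the block.
;
;   static void pred16x16_vertical_c(uint8_t *src, int stride)
;   {
;       const uint8_t *top = src - stride;      /* row above the block */
;       for (int y = 0; y < 16; y++)
;           memcpy(src + y * stride, top, 16);
;   }
;
; The asm unrolls this two rows per iteration: MMX stores 8+8 bytes per
; row, SSE a single aligned 16-byte store per row.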

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
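; Illustrative C sketch of horizontal prediction (our sketch, hypothetical
; _c name): each row is filled with the pixel immediately to its left.
;
;   static void pred16x16_horizontal_c(uint8_t *src, int stride)
;   {
;       for (int y = 0; y < 16; y++)
;           memset(src + y * stride, src[y * stride - 1], 16);
;   }
;
; The SIMD variants splat the left pixel across a register: pshufb with a
; constant of 3s (ssse3, since the movd below loads the 4 bytes ending at
; the left pixel), pshufw 0xff (mmxext), or punpck chains (plain mmx).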

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
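; Illustrative C sketch of 16x16 DC prediction (our sketch, assuming both
; edges are available, which is the case this file implements): the block
; is filled with the rounded average of the 16 top and 16 left neighbours.
;
;   static void pred16x16_dc_c(uint8_t *src, int stride)
;   {
;       int dc = 16;                            /* rounding term */
;       for (int i = 0; i < 16; i++)
;           dc += src[i - stride] + src[i * stride - 1];
;       dc >>= 5;
;       for (int y = 0; y < 16; y++)
;           memset(src + y * stride, dc, 16);
;   }
;
; The asm sums the top row with psadbw against zero and the left column
; with scalar movzx/add, matching the (sum + 16) >> 5 above.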

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
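; Illustrative C sketch of VP8's TrueMotion mode (our sketch): every pixel
; is predicted as top + left - topleft, saturated to 0..255.
;
;   static void pred16x16_tm_vp8_c(uint8_t *src, int stride)
;   {
;       const uint8_t *top = src - stride;
;       int tl = top[-1];
;       for (int y = 0; y < 16; y++)
;           for (int x = 0; x < 16; x++) {
;               int v = top[x] + src[y * stride - 1] - tl;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }
;
; The asm unpacks the top row to words once, then per row splats the scalar
; (left - topleft) difference and adds it; packuswb provides the saturation
; for free.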

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
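; Illustrative C sketch of H.264 16x16 plane prediction (our sketch,
; following the spec formulation; t[] is the top row, l[] the left column):
;
;   H = sum(i = 1..8) i * (t[7 + i] - t[7 - i])
;   V = sum(i = 1..8) i * (l[7 + i] - l[7 - i])
;   b = (5 * H + 32) >> 6       /* rv40 drops the +32; svq3 rescales in  */
;   c = (5 * V + 32) >> 6       /* two steps and swaps H/V, see below    */
;   a = 16 * (l[15] + t[15])
;   pred[y][x] = clip8((a + b * (x - 7) + c * (y - 7) + 16) >> 5)
;
; The H sum is done in SIMD (pmullw by pw_m8tom1/pw_1to8, or pmaddubsw
; with plane_shuf on ssse3) followed by a horizontal add; the V sum is
; scalar movzx arithmetic so it can overlap with the vector work.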

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    movd        r1d, m0
    movsx       r1d, r1w
%ifnidn %3, svq3
%ifidn %3, h264
    lea         r1d, [r1d*5+32]
%else ; rv40
    lea         r1d, [r1d*5]
%endif
    sar         r1d, 6
%else ; svq3
    test        r1d, r1d
    lea         r4d, [r1d+3]
    cmovs       r1d, r4d
    sar         r1d, 2           ; H/4
    lea         r1d, [r1d*5]     ; 5*(H/4)
    test        r1d, r1d
    lea         r4d, [r1d+15]
    cmovs       r1d, r4d
    sar         r1d, 4           ; (5*(H/4))/16
%endif
    movd         m0, r1d

    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
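; Same idea as the 16x16 plane mode, scaled to 8x8 (our sketch): only four
; taps per direction and different scaling, matching the pmullw by pw_17 /
; paddw pw_16 / psraw 5 sequence below and the scalar (r6*9+16 + r6*8)>>5:
;
;   H = sum(i = 1..4) i * (t[3 + i] - t[3 - i])
;   V = sum(i = 1..4) i * (l[3 + i] - l[3 - i])
;   b = (17 * H + 16) >> 5
;   c = (17 * V + 16) >> 5
;   a = 16 * (l[7] + t[7])
;   pred[y][x] = clip8((a + b * (x - 3) + c * (y - 3) + 16) >> 5)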

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
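; Illustrative C sketch of H.264 chroma 8x8 DC (our sketch): the block is
; split into four 4x4 quadrants, each with its own DC value.  With
; s0/s1 = sums of top[0..3]/top[4..7] and s2/s3 = sums of left[0..3]/
; left[4..7], as in the register comments below:
;
;   dc[0][0] = (s0 + s2 + 4) >> 3;   /* top-left: top and left     */
;   dc[0][1] = (s1 + 2) >> 2;        /* top-right: top only        */
;   dc[1][0] = (s3 + 2) >> 2;        /* bottom-left: left only     */
;   dc[1][1] = (s1 + s3 + 4) >> 3;   /* bottom-right: top and left */
;
; The pshufw/paddw/psrlw/pavgw sequence evaluates all four rounded
; divisions at once on the packed (s0,s1,s2,s3) words.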

INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
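; RV40 uses a single DC for the whole 8x8 block instead of the per-quadrant
; values above (our sketch), matching the lea r2d, [r2+r5+8] / shr r2d, 4
; below:
;
;   int dc = 8;                             /* rounding term */
;   for (int i = 0; i < 8; i++)
;       dc += src[i - stride] + src[i * stride - 1];
;   dc >>= 4;                               /* fill 8x8 with dc */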

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro
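
; Why this computes (%2 + 2*%4 + %3 + 2) >> 2 bytewise: pavgb gives the
; rounded-up average (a + b + 1) >> 1.  avg(l, r) may round up, so the low
; bit that rounding added, (l ^ r) & 1, is subtracted to recover the
; truncated (l + r) >> 1; averaging that with the centre tap then yields
; exactly (l + 2*s + r + 2) >> 2 for all byte inputs.  Scalar C sketch
; (our sketch, illustrative only):
;
;   static inline uint8_t lowpass(uint8_t l, uint8_t s, uint8_t r)
;   {
;       uint8_t lr = ((l + r + 1) >> 1) - ((l ^ r) & 1); /* (l + r) >> 1 */
;       return (s + lr + 1) >> 1;     /* == (l + 2*s + r + 2) >> 2 */
;   }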

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
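; The pred8x8l_* functions below all start by building a low-pass-filtered
; 8-pixel top edge (and, where needed, left edge) before predicting.  An
; illustrative C sketch of the top-edge step (our sketch; lowpass() as in
; the macro above):
;
;   /* t[-1..8]: topleft, the 8 top pixels, topright */
;   if (!has_topleft)  t[-1] = t[0];    /* replicate when unavailable */
;   if (!has_topright) t[8]  = t[7];
;   for (int x = 0; x < 8; x++)
;       top[x] = lowpass(t[x - 1], t[x], t[x + 1]);
;
; In the asm, the .fix_lt_* / .fix_tr_* blocks perform the replication by
; patching the first or last byte of an MMX register with psllq/psrlq/pxor
; masks, and PRED4x4_LOWPASS filters eight bytes at a time.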
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0
    paddw    mm7, [pw_4]
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r2]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
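; Diagonal down-left fills the block from a 16-entry filtered edge built
; out of the top and top-right pixels; each row is the previous row
; shifted along by one.  Illustrative C sketch (our sketch):
;
;   /* e[0..15]: low-pass-filtered top + top-right edge */
;   for (int y = 0; y < 8; y++)
;       for (int x = 0; x < 8; x++)
;           src[y * stride + x] = e[x + y];
;
; The mmxext version keeps the edge in two MMX registers and materialises
; each row with psllq/psrlq/por byte shifts; the sse2/ssse3 macro keeps all
; 16 bytes in one XMM register and peels rows off with psrldq.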

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_mmxext, 4,5
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq  [r4+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r4+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r0+r3*2], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0
    movq  [r0+r3*1], mm1
    RET

%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test         r2, r2 ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4
    jmp .body
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.body:
    lea         r1, [r0+r3*2]
    movq       mm1, mm7
    movq       mm7, mm5
    movq       mm5, mm6
    movq       mm2, mm7
    lea         r2, [r1+r3*2]
    PALIGNR    mm2, mm6, 1, mm0
    movq       mm3, mm7
    PALIGNR    mm3, mm6, 7, mm0
    movq       mm4, mm7
    lea         r4, [r2+r3*2]
    psrlq      mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r4+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r0+r3*2], mm0
    psrlq      mm0, 8
    psllq      mm1, 56
    por        mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
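; Vertical-right predicts even rows from rounded averages of adjacent
; filtered edge pixels (pavgb) and odd rows from the 3-tap lowpass, each
; pair of rows shifting one pixel in from the top-left corner.  A rough
; sketch of the row structure (our sketch; e[] is the filtered edge
; running from the left column through the corner into the top row):
;
;   row 0: avg(e[i], e[i+1])        row 1: lowpass(e[i-1], e[i], e[i+1])
;   row 2: row 0 shifted right 1    row 3: row 1 shifted right 1
;   ...                             (shifted-in pixels come from the
;                                    filtered left edge)
;
; which is why the asm computes two prototype rows (mm3 via pavgb, mm0 via
; PRED4x4_LOWPASS) and then PALIGNRs left-edge bytes into them for the
; remaining six rows.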

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm7, mm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top
1901
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1902
    lea         r1, [r0+r3*2]
1903
    movq       mm2, mm6
1904
    movq       mm3, mm6
1905
    PALIGNR    mm3, mm7, 7, mm0
1906
    PALIGNR    mm6, mm7, 6, mm1
1907
    movq       mm4, mm3
1908
    pavgb      mm3, mm2
1909
    lea         r2, [r1+r3*2]
1910
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1911
    movq [r0+r3*1], mm3
1912
    movq [r0+r3*2], mm0
1913
    movq       mm5, mm0
1914
    movq       mm6, mm3
1915
    movq       mm1, mm7
1916
    movq       mm2, mm1
1917
    psllq      mm2, 8
1918
    movq       mm3, mm1
1919
    psllq      mm3, 16
1920
    lea         r4, [r2+r3*2]
1921
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1922
    PALIGNR    mm6, mm0, 7, mm2
1923
    movq [r1+r3*1], mm6
1924
    psllq      mm0, 8
1925
    PALIGNR    mm5, mm0, 7, mm1
1926
    movq [r1+r3*2], mm5
1927
    psllq      mm0, 8
1928
    PALIGNR    mm6, mm0, 7, mm2
1929
    movq [r2+r3*1], mm6
1930
    psllq      mm0, 8
1931
    PALIGNR    mm5, mm0, 7, mm1
1932
    movq [r2+r3*2], mm5
1933
    psllq      mm0, 8
1934
    PALIGNR    mm6, mm0, 7, mm2
1935
    movq [r4+r3*1], mm6
1936
    psllq      mm0, 8
1937
    PALIGNR    mm5, mm0, 7, mm1
1938
    movq [r4+r3*2], mm5
1939
    RET
1940

    
%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea           r1, [r0+r3*2]
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4
    movhlps     xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
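; Annotation (editor's sketch, not original): vertical-left interpolates at
; half-sample steps from the filtered top/top-right row t[0..15]: even rows
; are avg(t[i], t[i+1]) (pavgb), odd rows (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2,
; and each row pair starts one sample further right (the psrldq 1 ladder).
; With has_topright == 0, .fix_tr_2 broadcasts the last top sample instead.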

%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1
    pslldq    xmm1, 1
    pavgb     xmm3, xmm2
    lea         r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
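; Annotation (editor's sketch, not original): horizontal-up uses only the
; filtered left edge l[0..7]: output pairs are avg(l[i], l[i+1]) and
; (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2, with everything past l7 clamped to l7
; (the punpckhbw/pshufw broadcasts), so the bottom rows flatten out.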

%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r2]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
.do_left:
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
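; Annotation (editor's sketch, not original): horizontal-down predicts along
; a diagonal two samples left per row down, mixing the filtered left edge
; with the top-left corner and top row. punpcklbw/punpckhbw interleave the
; pairwise averages with the 121-filtered values, and each row above the
; bottom one is the same packed vector advanced two bytes (the PALIGNR
; ladder).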

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq       mm5, mm4
    lea         r1, [r0+r3*2]
    psllq      mm7, 56
    movq       mm2, mm5
    movq       mm3, mm6
    movq       mm4, mm2
    PALIGNR    mm2, mm6, 7, mm5
    PALIGNR    mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR    mm4, mm3, 1, mm7
    movq       mm5, mm3
    pavgb      mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq       mm4, mm2
    movq       mm1, mm2
    lea         r4, [r2+r3*2]
    psrlq      mm4, 16
    psrlq      mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq       mm7, mm3
    punpcklbw  mm3, mm0
    punpckhbw  mm7, mm0
    movq       mm1, mm7
    movq       mm0, mm7
    movq       mm4, mm7
    movq [r4+r3*2], mm3
    PALIGNR    mm7, mm3, 2, mm5
    movq [r4+r3*1], mm7
    PALIGNR    mm1, mm3, 4, mm5
    movq [r2+r3*2], mm1
    PALIGNR    mm0, mm3, 6, mm3
    movq [r2+r3*1], mm0
    movq       mm2, mm6
    movq       mm3, mm6
    movq [r1+r3*2], mm4
    PALIGNR    mm6, mm4, 2, mm5
    movq [r1+r3*1], mm6
    PALIGNR    mm2, mm4, 4, mm5
    movq [r0+r3*2], mm2
    PALIGNR    mm3, mm4, 6, mm4
    movq [r0+r3*1], mm3
    RET

%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8
    por        xmm0, xmm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm1, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5
INIT_XMM
    lea         r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps   xmm0, xmm4
    movq   [r0+r3*2], xmm4
    movq   [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3*1], xmm4
    movq   [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4
    movq   [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*1], xmm4
    movq   [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
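; Annotation (editor's note, not original): the 4x4 DC prediction is
;   dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3
; psadbw against zero sums the four top bytes in one instruction, the left
; column is accumulated with scalar movzx/add, and the imul by 0x01010101
; broadcasts dc to all four bytes of the dword stored to each row.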

cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
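; Annotation (editor's note, not original): VP8 TrueMotion computes per pixel
;   pred[y][x] = clip(top[x] + left[y] - topleft, 0, 255)
; The top row is widened to words once, (left[y] - topleft) is formed as a
; scalar and broadcast per row, and packuswb supplies the clip when packing
; back to bytes.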

%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
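; Annotation (editor's note, not original): VP8 vertical prediction smooths
; the top edge before replicating it; every output row is
;   (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2
; hence the extra unaligned load of the row above at [r0-1].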

INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
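; Annotation (editor's sketch, not original): diagonal down-left computes
;   pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2
; over the top and top-right samples, with the last sample replicated past
; the edge (the pxor/psllq prologue); each successive row is then just the
; filtered vector shifted right one byte (the psrlq/movd ladder).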
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
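; Annotation (editor's sketch, not original): vertical-left writes the
; half-sample averages avg(t[i], t[i+1]) to rows 0/2 and the 121-filtered
; values (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 to rows 1/3, with the lower
; copy of each offset one sample to the right (the psrlq 8 pair).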

INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8
    psrlq     m2, 16
    movq      m4, m3
    pavgb     m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
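; Annotation (editor's sketch, not original): horizontal-up interpolates down
; the left column l0..l3 with avg(l[i], l[i+1]) and
; (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2 pairs; samples below l3 are clamped to
; l3 (the punpckhbw/pshufw 0xFF broadcast), which leaves the last row flat.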

INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF
    punpckhdq m0, m1
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16
    psrlq     m3, 8
    pavgb     m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw m7, m4
    movd    [r0+r2*1], m7
    psrlq    m7, 16
    movd    [r0+r2*2], m7
    psrlq    m7, 16
    movd    [r1+r2*1], m7
    movd    [r1+r2*2], m1
    RET

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
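; Annotation (editor's sketch, not original): horizontal-down filters the
; packed edge vector t2 t1 t0 lt l0 l1 l2 l3 once, interleaves the pairwise
; averages with the 121-filtered values (punpcklbw), and then emits each row
; from the same result advanced two bytes per row upwards (the psrlq 16s).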

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]      ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4] ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd      m2, [r0+r2*2-4] ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw m5, m3
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4
    movh      [r1+r2*2], m5
    psrlq     m5, 16
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
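; Annotation (editor's sketch, not original): vertical-right writes the
; averages avg(t[i-1], t[i]) to the even rows and the 121 filter over lt and
; the top row to the odd rows; the lower row pair is shifted one byte right,
; with left-edge samples slid in from the left via the psllq/PALIGNR pair.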

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_mmxext, 3,3
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movh    m0, [r0]                    ; ........t3t2t1t0
    movq    m5, m0
    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
    pavgb   m5, m0
    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
    movq    m1, m0
    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
    movq    m2, m0
    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq    m1, m3
    psrlq   m3, 16
    psllq   m1, 48
    movh    [r0+r2*1], m5
    movh    [r0+r2*2], m3
    PALIGNR m5, m1, 7, m2
    psllq   m1, 8
    movh    [r1+r2*1], m5
    PALIGNR m3, m1, 7, m1
    movh    [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
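; Annotation (editor's sketch, not original): diagonal down-right packs the
; left column, the corner and the top row into one edge vector, 121-filters
; it once, and peels off the four rows bottom-up, each one byte further right
; (the psrlq 8 ladder).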

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m1, [r1-8]
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]
    movh      m3, [r0]
    punpckhwd m1, m2
    PALIGNR   m3, m1, 5, m1
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh      [r1+r2*2], m0
    psrlq     m0, 8
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0
    RET
%endif