;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
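; Rough C reference (a sketch, not part of the original file; the _c name is
; illustrative): vertical prediction copies the row of pixels just above the
; block into all 16 rows.
;
;   static void pred16x16_vertical_c(uint8_t *src, int stride)
;   {
;       int x, y;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++)
;               src[y*stride + x] = src[x - stride];
;   }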

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
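; C sketch (illustrative, not from this file): each row is filled with the
; pixel immediately to its left. The asm below loads 4 bytes at src-4 and
; broadcasts the last one (punpck chains, pshufw 0xff, or pshufb [pb_3]).
;
;   static void pred16x16_horizontal_c(uint8_t *src, int stride)
;   {
;       int x, y;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++)
;               src[y*stride + x] = src[y*stride - 1];
;   }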

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
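; C sketch (illustrative): DC prediction fills the block with the rounded
; mean of the 16 top and 16 left neighbours, matching the
; "lea r2d, [r2+r5+16] / shr r2d, 5" pair below.
;
;   int i, dc = 16;                          // rounding term
;   for (i = 0; i < 16; i++)
;       dc += src[i - stride] + src[i*stride - 1];
;   dc >>= 5;
;   // then broadcast dc to all 256 pixels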

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
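; C sketch (illustrative) of VP8 "TrueMotion" prediction, which the loops
; below vectorize one row at a time (the packuswb does the clipping):
;
;   static void pred16x16_tm_c(uint8_t *src, int stride)
;   {
;       int x, y, tl = src[-stride - 1];
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++) {
;               int v = src[y*stride - 1] - tl + src[x - stride];
;               src[y*stride + x] = v < 0 ? 0 : (v > 255 ? 255 : v);
;           }
;   }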

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
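; The plane predictors fit a linear ramp through the edge pixels. Sketch of
; the integer math the macro implements (Intra_16x16_Plane; top[-1] is the
; top-left pixel):
;
;   H = sum(i * (top[7+i]  - top[7-i]))    for i = 1..8
;   V = sum(i * (left[7+i] - left[7-i]))   for i = 1..8
;   b = (5*H + 32) >> 6                    (h264; rv40 uses (5*H) >> 6,
;   c = (5*V + 32) >> 6                     svq3 uses 5*(H/4) >> 4 and
;                                           swaps H and V, cf. SWAP below)
;   a = 16 * (top[15] + left[15])
;   pred[y][x] = clip_uint8((a + b*(x-7) + c*(y-7) + 16) >> 5)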

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    movd        r1d, m0
    movsx       r1d, r1w
%ifnidn %3, svq3
%ifidn %3, h264
    lea         r1d, [r1d*5+32]
%else ; rv40
    lea         r1d, [r1d*5]
%endif
    sar         r1d, 6
%else ; svq3
    test        r1d, r1d
    lea         r4d, [r1d+3]
    cmovs       r1d, r4d
    sar         r1d, 2           ; H/4
    lea         r1d, [r1d*5]     ; 5*(H/4)
    test        r1d, r1d
    lea         r4d, [r1d+15]
    cmovs       r1d, r4d
    sar         r1d, 4           ; (5*(H/4))/16
%endif
    movd         m0, r1d

    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
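; Same linear-ramp scheme as the 16x16 version above, on an 8x8 chroma block.
; Sketch of the integer math:
;
;   H = sum(i * (top[3+i]  - top[3-i]))    for i = 1..4
;   V = sum(i * (left[3+i] - left[3-i]))   for i = 1..4
;   b = (17*H + 16) >> 5                   (the pw_17/pw_16 pair below)
;   c = (17*V + 16) >> 5                   (the r6*9+16 / r6*8 lea pair)
;   a = 16 * (top[7] + left[7])
;   pred[y][x] = clip_uint8((a + b*(x-3) + c*(y-3) + 16) >> 5)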

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
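; C sketch (illustrative): top-only DC for an 8x8 chroma block. Each 4-pixel
; half of the top edge t[0..7] gets its own DC, filling the corresponding
; 4 columns of all 8 rows (the psrlw/pavgw pairs below do the +2 rounding):
;
;   int dc0 = (t[0] + t[1] + t[2] + t[3] + 2) >> 2;   // left 4 columns
;   int dc1 = (t[4] + t[5] + t[6] + t[7] + 2) >> 2;   // right 4 columns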
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
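; C sketch (illustrative): full chroma DC works on four 4x4 quadrants. With
; s0/s1 the sums of the left/right halves of the top edge and s2/s3 the sums
; of the upper/lower halves of the left edge (as the psadbw/movzx code below
; computes):
;
;   int dc_topleft     = (s0 + s2 + 4) >> 3;
;   int dc_topright    = (s1 + 2) >> 2;
;   int dc_bottomleft  = (s3 + 2) >> 2;
;   int dc_bottomright = (s1 + s3 + 4) >> 3;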

INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
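; C sketch (illustrative): the RV40 variant uses a single DC over all 8 top
; and 8 left neighbours, matching "lea r2d, [r2+r5+8] / shr r2d, 4" below:
;
;   int i, dc = 8;                           // rounding term
;   for (i = 0; i < 8; i++)
;       dc += src[i - stride] + src[i*stride - 1];
;   dc >>= 4;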

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro
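; Why this works (derivation, not from the original source): per byte,
;     (l + 2*c + r + 2) >> 2  ==  avg(c, (l + r) >> 1)
; where avg(a,b) = (a + b + 1) >> 1 is exactly pavgb. pavgb(l, r) yields
; (l + r) >> 1 plus a rounding carry of ((l ^ r) & 1), so the pand/psubusb
; pair strips that carry before the final pavgb. psubusb cannot underflow
; here: the carry is 1 only when l + r is odd, in which case the average is
; at least 1.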

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
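; The pred8x8l variants first build a low-pass-filtered edge with
; PRED4x4_LOWPASS, patching the end samples when the top-left or top-right
; neighbours are unavailable (the .fix_* paths). C sketch (illustrative)
; for top_dc, with t[-1..8] = top-left, the 8 top pixels and the first
; top-right pixel:
;
;   // t2[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2   for x = 0..7
;   int x, dc = 4;                  // matches paddw pw_4 / psrlw 3 below
;   for (x = 0; x < 8; x++)
;       dc += t2[x];
;   dc >>= 3;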
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0
    paddw    mm7, [pw_4]
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
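; C sketch (illustrative): as top_dc above, but averaging the filtered left
; edge as well, matching "paddw mm0, [pw_8] / psrlw mm0, 4" below:
;
;   int i, dc = 8;
;   for (i = 0; i < 8; i++)
;       dc += t2[i] + l2[i];        // filtered top and left edges
;   dc >>= 4;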

%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    lea          r1, [r0+r3]
    cmovnz       r1, r0
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
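; Diagonal-down-left (sketch of the math): with e[0..15] the filtered top and
; top-right edge samples (top-right replicated from the last top pixel when
; unavailable, cf. .fix_tr_2), every anti-diagonal of the block is a single
; value:
;
;   pred[y][x] = (e[x+y] + 2*e[x+y+1] + e[min(x+y+2, 15)] + 2) >> 2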

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_mmxext, 4,5
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq  [r4+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r4+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r0+r3*2], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0
    movq  [r0+r3*1], mm1
    RET

%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test         r2, r2 ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4
    jmp .body
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    lea         r1, [r0+r3*2]
    movq       mm1, mm7
    movq       mm7, mm5
    movq       mm5, mm6
    movq       mm2, mm7
    lea         r2, [r1+r3*2]
    PALIGNR    mm2, mm6, 1, mm0
    movq       mm3, mm7
    PALIGNR    mm3, mm6, 7, mm0
    movq       mm4, mm7
    lea         r4, [r2+r3*2]
    psrlq      mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r4+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r0+r3*2], mm0
    psrlq      mm0, 8
    psllq      mm1, 56
    por        mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
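; Vertical-right (sketch of the math): with t2[] the filtered top edge
; (t2[-1] = filtered top-left), the first row is a 2-tap average and the
; second a 3-tap filter:
;
;   row0[x] = (t2[x-1] + t2[x] + 1) >> 1                  (the pavgb below)
;   row1[x] = (t2[x-2] + 2*t2[x-1] + t2[x] + 2) >> 2
;
; Each subsequent pair of rows repeats the pair above it shifted one pixel
; to the right, with new pixels derived from the filtered left edge shifted
; in (the psllq/PALIGNR chain).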
1826

    
1827
INIT_MMX
1828
%define PALIGNR PALIGNR_MMX
1829
cglobal pred8x8l_vertical_right_mmxext, 4,5
1830
    sub          r0, r3
1831
    lea          r4, [r0+r3*2]
1832
    movq        mm0, [r0+r3*1-8]
1833
    punpckhbw   mm0, [r0+r3*0-8]
1834
    movq        mm1, [r4+r3*1-8]
1835
    punpckhbw   mm1, [r0+r3*2-8]
1836
    mov          r4, r0
1837
    punpckhwd   mm1, mm0
1838
    lea          r0, [r0+r3*4]
1839
    movq        mm2, [r0+r3*1-8]
1840
    punpckhbw   mm2, [r0+r3*0-8]
1841
    lea          r0, [r0+r3*2]
1842
    movq        mm3, [r0+r3*1-8]
1843
    punpckhbw   mm3, [r0+r3*0-8]
1844
    punpckhwd   mm3, mm2
1845
    punpckhdq   mm3, mm1
1846
    lea          r0, [r0+r3*2]
1847
    movq        mm0, [r0+r3*0-8]
1848
    movq        mm1, [r4]
1849
    mov          r0, r4
1850
    movq        mm4, mm3
1851
    movq        mm2, mm3
1852
    PALIGNR     mm4, mm0, 7, mm0
1853
    PALIGNR     mm1, mm2, 1, mm2
1854
    test        r1, r1
1855
    jz .fix_lt_1
1856
    jmp .do_left
1857
.fix_lt_1:
1858
    movq        mm5, mm3
1859
    pxor        mm5, mm4
1860
    psrlq       mm5, 56
1861
    psllq       mm5, 48
1862
    pxor        mm1, mm5
1863
    jmp .do_left
1864
.fix_lt_2:
1865
    movq        mm5, mm3
1866
    pxor        mm5, mm2
1867
    psllq       mm5, 56
1868
    psrlq       mm5, 56
1869
    pxor        mm2, mm5
1870
    test         r2, r2
1871
    jnz .do_top
1872
.fix_tr_1:
1873
    movq        mm5, mm3
1874
    pxor        mm5, mm1
1875
    psrlq       mm5, 56
1876
    psllq       mm5, 56
1877
    pxor        mm1, mm5
1878
    jmp .do_top
1879
.do_left:
1880
    movq        mm0, mm4
1881
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1882
    movq        mm7, mm2
1883
    movq        mm0, [r0-8]
1884
    movq        mm3, [r0]
1885
    movq        mm1, [r0+8]
1886
    movq        mm2, mm3
1887
    movq        mm4, mm3
1888
    PALIGNR     mm2, mm0, 7, mm0
1889
    PALIGNR     mm1, mm4, 1, mm4
1890
    test         r1, r1
1891
    jz .fix_lt_2
1892
    test         r2, r2
1893
    jz .fix_tr_1
1894
.do_top
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea         r1, [r0+r3*2]
    movq       mm2, mm6
    movq       mm3, mm6
    PALIGNR    mm3, mm7, 7, mm0
    PALIGNR    mm6, mm7, 6, mm1
    movq       mm4, mm3
    pavgb      mm3, mm2
    lea         r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    movq       mm5, mm0
    movq       mm6, mm3
    movq       mm1, mm7
    movq       mm2, mm1
    psllq      mm2, 8
    movq       mm3, mm1
    psllq      mm3, 16
    lea         r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR    mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq      mm0, 8
    PALIGNR    mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq      mm0, 8
    PALIGNR    mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET

%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea           r1, [r0+r3*2]
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4
    movhlps     xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
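; Vertical-left interleaves an averaged and a low-pass filtered copy of the
; top(+topright) edge, advancing one byte per row pair.  Rough sketch
; (illustrative only; t[] is a hypothetical array of filtered top samples):
;   even rows: pred[y][x] = (t[x+(y>>1)] + t[x+(y>>1)+1] + 1) >> 1
;   odd rows:  pred[y][x] = (t[x+(y>>1)] + 2*t[x+(y>>1)+1] + t[x+(y>>1)+2] + 2) >> 2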

%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1
    pslldq    xmm1, 1
    pavgb     xmm3, xmm2
    lea         r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
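; Horizontal-up interpolates downwards through the left edge only; positions
; past the last left sample l7 saturate to l7 (the punpckhbw mm7, mm7 splat
; below).  Rough sketch (illustrative only; l[] hypothetical, indices clamped):
;   pred[y][x] = (x & 1) ? (l[y+(x>>1)] + 2*l[y+(x>>1)+1] + l[y+(x>>1)+2] + 2) >> 2
;                        : (l[y+(x>>1)] + l[y+(x>>1)+1] + 1) >> 1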

%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    lea          r1, [r0+r3]
    cmovnz       r1, r0
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
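; Horizontal-down steps along the diagonal through the left edge, the corner
; and the top edge; each predicted row repeats the row above shifted two
; samples, which is why the stores below walk PALIGNR offsets 2/4/6.
; Rough sketch (illustrative only; l[]/lt are the filtered neighbours):
;   pred[y][0] = (l[y-1] + l[y] + 1) >> 1 with l[-1] = lt, pred[y][1] the
;   3-tap value over the same neighbourhood; for x >= 2,
;   pred[y][x] = pred[y-1][x-2].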

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq       mm5, mm4
    lea         r1, [r0+r3*2]
    psllq      mm7, 56
    movq       mm2, mm5
    movq       mm3, mm6
    movq       mm4, mm2
    PALIGNR    mm2, mm6, 7, mm5
    PALIGNR    mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR    mm4, mm3, 1, mm7
    movq       mm5, mm3
    pavgb      mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq       mm4, mm2
    movq       mm1, mm2
    lea         r4, [r2+r3*2]
    psrlq      mm4, 16
    psrlq      mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq       mm7, mm3
    punpcklbw  mm3, mm0
    punpckhbw  mm7, mm0
    movq       mm1, mm7
    movq       mm0, mm7
    movq       mm4, mm7
    movq [r4+r3*2], mm3
    PALIGNR    mm7, mm3, 2, mm5
    movq [r4+r3*1], mm7
    PALIGNR    mm1, mm3, 4, mm5
    movq [r2+r3*2], mm1
    PALIGNR    mm0, mm3, 6, mm3
    movq [r2+r3*1], mm0
    movq       mm2, mm6
    movq       mm3, mm6
    movq [r1+r3*2], mm4
    PALIGNR    mm6, mm4, 2, mm5
    movq [r1+r3*1], mm6
    PALIGNR    mm2, mm4, 4, mm5
    movq [r0+r3*2], mm2
    PALIGNR    mm3, mm4, 6, mm4
    movq [r0+r3*1], mm3
    RET

%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8
    por        xmm0, xmm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm1, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5
INIT_XMM
    lea         r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps   xmm0, xmm4
    movq   [r0+r3*2], xmm4
    movq   [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3*1], xmm4
    movq   [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4
    movq   [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*1], xmm4
    movq   [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
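; The DC value is the rounded mean of the four top and four left neighbours,
; broadcast to all 16 pixels; psadbw against zero sums the top bytes in one
; instruction and imul 0x01010101 replicates the byte across the dword.
; Equivalent C (illustrative only):
;   dc = (t[0]+t[1]+t[2]+t[3] + l[0]+l[1]+l[2]+l[3] + 4) >> 3;
;   each of the four rows is then stored as the 32-bit value dc*0x01010101.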

cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
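; VP8 TrueMotion prediction: pred[y][x] = clip_uint8(left[y] + top[x] - tl).
; The top row is kept unpacked to words, the broadcast (left[y] - tl) is added
; per row, and packuswb provides the clamp.  Equivalent C (illustrative only):
;   for (y = 0; y < 4; y++)
;       for (x = 0; x < 4; x++)
;           src[y*stride + x] = av_clip_uint8(left[y] + top[x] - tl);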

%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
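; All four rows receive the same smoothed top row.  With tl the top-left
; sample and t[4] taken from the topright pointer (illustrative only):
;   pred[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2,  t[-1] = tl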

INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
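; Down-left extrapolates along the anti-diagonal from the top and topright
; samples; each psrlq below advances the filtered vector one diagonal step.
; Roughly (illustrative only; the last sample t[7] is repeated at the edge):
;   pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2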
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
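; Vertical-left: even rows average two adjacent top samples, odd rows take
; the 3-tap low-pass value; rows 2-3 reuse rows 0-1 advanced one sample.
; Roughly (illustrative only):
;   pred[y][x] = (y & 1) ? (t[x+(y>>1)] + 2*t[x+(y>>1)+1] + t[x+(y>>1)+2] + 2) >> 2
;                        : (t[x+(y>>1)] + t[x+(y>>1)+1] + 1) >> 1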

INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8
    psrlq     m2, 16
    movq      m4, m3
    pavgb     m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
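; Horizontal-up reads only the left column; samples beyond l3 saturate to l3,
; which is what the punpckhbw/pshufw 0xFF splat below implements.  Roughly
; (illustrative only, indices clamped to 3):
;   pred[y][x] = (x & 1) ? (l[y+(x>>1)] + 2*l[y+(x>>1)+1] + l[y+(x>>1)+2] + 2) >> 2
;                        : (l[y+(x>>1)] + l[y+(x>>1)+1] + 1) >> 1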

INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF
    punpckhdq m0, m1
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16
    psrlq     m3, 8
    pavgb     m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw m7, m4
    movd    [r0+r2*1], m7
    psrlq    m7, 16
    movd    [r0+r2*2], m7
    psrlq    m7, 16
    movd    [r1+r2*1], m7
    movd    [r1+r2*2], m1
    RET

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
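; Horizontal-down packs "t2 t1 t0 lt l0 l1 l2 l3" into one register so the
; averaged and low-pass results can be interleaved with punpcklbw; each row
; going up is the row below shifted two samples (the psrlq m5, 16 steps).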

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]      ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4] ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd      m2, [r0+r2*2-4] ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw m5, m3
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4
    movh      [r1+r2*2], m5
    psrlq     m5, 16
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
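; Vertical-right: row 0 averages each top sample with its left neighbour
; (t[-1] = lt), row 1 is the 3-tap low-pass value, and rows 2-3 repeat
; rows 0-1 with one more left sample shifted in via PALIGNR.  Roughly
; (illustrative only, left samples extending t[] below index 0):
;   pred[0][x] = (t[x-1] + t[x] + 1) >> 1
;   pred[1][x] = (t[x-2] + 2*t[x-1] + t[x] + 2) >> 2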

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_mmxext, 3,3
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movh    m0, [r0]                    ; ........t3t2t1t0
    movq    m5, m0
    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
    pavgb   m5, m0
    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
    movq    m1, m0
    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
    movq    m2, m0
    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq    m1, m3
    psrlq   m3, 16
    psllq   m1, 48
    movh    [r0+r2*1], m5
    movh    [r0+r2*2], m3
    PALIGNR m5, m1, 7, m2
    psllq   m1, 8
    movh    [r1+r2*1], m5
    PALIGNR m3, m1, 7, m1
    movh    [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
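; Down-right low-pass filters the concatenated edge "l2 l1 l0 lt t0 t1 t2 t3"
; once and emits it along the main diagonal: each row going up is the
; filtered vector shifted right one byte (the psrlq m0, 8 steps below).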

INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m1, [r1-8]
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]
    movh      m3, [r0]
    punpckhwd m1, m2
    PALIGNR   m3, m1, 5, m1
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh      [r1+r2*2], m0
    psrlq     m0, 8
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0
    RET
%endif