;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET
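
; Reference: the routines above implement vertical prediction, which just
; replicates the row of 16 pixels directly above the block. Rough C
; equivalent (illustrative sketch, not the source the asm was derived from):
;
;     for (int y = 0; y < 16; y++)
;         memcpy(src + y * stride, src - stride, 16);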

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3
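
; Reference: horizontal prediction broadcasts the pixel to the left of each
; row across that row; the ssse3 path does the splat with a single pshufb
; against the pb_3 constant. Rough C equivalent (illustrative sketch):
;
;     for (int y = 0; y < 16; y++)
;         memset(src + y * stride, src[y * stride - 1], 16);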

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3
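
; Reference: the DC predictor fills the block with the rounded average of
; the 16 top neighbours (summed with psadbw) and the 16 left neighbours
; (summed with scalar movzx/add). Rough C equivalent (illustrative sketch):
;
;     int dc = 16;                                     // rounding term
;     for (int i = 0; i < 16; i++)
;         dc += src[i - stride] + src[i * stride - 1];
;     dc >>= 5;
;     for (int y = 0; y < 16; y++)
;         memset(src + y * stride, dc, 16);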

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
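
; Reference: VP8's TM ("TrueMotion") predictor adds the per-row left-edge
; delta to the top row; packuswb supplies the final clamp to 0..255.
; Rough C equivalent (illustrative sketch):
;
;     int tl = src[-stride - 1];
;     for (int y = 0; y < 16; y++)
;         for (int x = 0; x < 16; x++)
;             src[y * stride + x] =
;                 av_clip_uint8(src[x - stride] + src[y * stride - 1] - tl);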

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

%ifidn %3, h264
    pmullw       m0, [pw_5]
    paddw        m0, [pw_32]
    psraw        m0, 6
%elifidn %3, rv40
    pmullw       m0, [pw_5]
    psraw        m0, 6
%elifidn %3, svq3
    movd        r3d, m0
    movsx        r3, r3w
    test         r3, r3
    lea          r4, [r3+3]
    cmovs        r3, r4
    sar          r3, 2           ; H/4
    lea          r3, [r3*5]      ; 5*(H/4)
    test         r3, r3
    lea          r4, [r3+15]
    cmovs        r3, r4
    sar          r3, 4           ; (5*(H/4))/16
    movd         m0, r3d
%endif

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3
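
; Reference: plane prediction fits a linear gradient through the edge
; pixels. With T[] the row above (T[-1] being the top-left sample) and L[]
; the column to the left, the H.264 form is roughly (illustrative sketch):
;
;     H = sum(i = 1..8, i * (T[7 + i] - T[7 - i]))
;     V = sum(i = 1..8, i * (L[7 + i] - L[7 - i]))
;     a = 16 * (T[15] + L[15] + 1)
;     b = (5 * H + 32) >> 6,  c = (5 * V + 32) >> 6
;     pred[y][x] = clip((a + b * (x - 7) + c * (y - 7) + 16) >> 5)
;
; The rv40 and svq3 variants differ only in how b and c are rounded from
; H and V, which is what the %3-dependent blocks above select.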

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8
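
; Reference: the same linear fit as the 16x16 plane predictor, scaled to
; the 8x8 chroma case, where the spec uses roughly (illustrative sketch)
;
;     H = sum(i = 1..4, i * (T[3 + i] - T[3 - i]))
;     b = (17 * H + 16) >> 5        ; hence the pw_17/pw_16 constants above
;
; and symmetrically for V from the left column.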

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
%endif
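
; Reference: top_dc computes one DC per 4-pixel half of the top edge,
; dc = (t0 + t1 + t2 + t3 + 2) >> 2, and fills the corresponding 4x8
; column half with it. The psrlw-by-1 / pavgw-against-zero pair performs
; the rounding, since avg(s >> 1, 0) == (s + 2) >> 2.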

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
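
; Reference: the RV40 variant uses a single DC for the whole 8x8 block,
; dc = (top8 + left8 + 8) >> 4 (the lea/shr pair above), instead of the
; four per-quadrant DCs used by H.264 chroma prediction.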

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET
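
; Reference: tm_shuf (0x03, 0x80 repeated) lets a single pshufb both
; broadcast byte 3 (the pixel loaded 4 bytes before the row) and
; zero-extend it to words, since an 0x80 selector byte writes zero.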

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro
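
; Reference: the macro above evaluates the 3-tap filter entirely in bytes.
; pavgb rounds up, so avg(l, r) - ((l ^ r) & 1) is the truncating average
; floor((l + r) / 2); averaging that with the centre tap then yields
; (l + 2*src + r + 2) >> 2, the value documented above. Rough C equivalent
; (illustrative sketch) for one byte lane:
;
;     uint8_t lowpass(uint8_t l, uint8_t c, uint8_t r)
;     {
;         return (l + 2 * c + r + 2) >> 2;
;     }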

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0
    paddw    mm7, [pw_4]
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3
%endif
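
; Reference: the .fix_lt_*/.fix_tr_* paths patch missing neighbours before
; filtering: with has_topleft == 0 the top-left byte of the shifted edge
; vector is replaced by top[0], and with has_topright == 0 the byte past
; the row end is replaced by top[7], via the xor/shift/xor sequences. The
; 8x8 luma ("8x8l") predictors then run PRED4x4_LOWPASS over the patched
; edges before taking the DC average.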

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r2]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test         r2, r2 ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4
    jmp .body
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    lea         r1, [r0+r3*2]
    movq       mm1, mm7
    movq       mm7, mm5
    movq       mm5, mm6
    movq       mm2, mm7
    lea         r2, [r1+r3*2]
    PALIGNR    mm2, mm6, 1, mm0
    movq       mm3, mm7
    PALIGNR    mm3, mm6, 7, mm0
    movq       mm4, mm7
    lea         r4, [r2+r3*2]
    psrlq      mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r4+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r0+r3*2], mm0
    psrlq      mm0, 8
    psllq      mm1, 56
    por        mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET
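
; Reference: dc = (top4 + left4 + 4) >> 3; multiplying the byte value by
; 0x01010101 splats it across all four bytes of the dword stored per row.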

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
%endif
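
; Reference: down_left applies the 3-tap lowpass to the 8 pixels of
; top + topright (with t7 duplicated past the end by the xor/shift trick)
; and emits the result at a one-pixel diagonal offset per row, i.e. roughly
; pred[y][x] = lp[x + y + 1], lp[n] = (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2.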