;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
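; Comment-only C sketch (illustrative, not part of the build; the _c name is
; hypothetical) of what both versions below compute: the row of 16 pixels
; above the block is copied into all 16 rows.
;
;   static void pred16x16_vertical_c(uint8_t *src, int stride)
;   {
;       int x, y;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++)
;               src[y * stride + x] = src[x - stride];
;   }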

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
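; Comment-only C sketch of the horizontal mode implemented below: each row
; is flooded with the pixel immediately to its left.
;
;   static void pred16x16_horizontal_c(uint8_t *src, int stride)
;   {
;       int x, y;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++)
;               src[y * stride + x] = src[y * stride - 1];
;   }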

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
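; Comment-only C sketch of the DC mode, for the case this asm handles (both
; edges available): average the 16 top and 16 left neighbours with rounding,
; then fill the block.
;
;   static void pred16x16_dc_c(uint8_t *src, int stride)
;   {
;       int x, y, dc = 16;
;       for (x = 0; x < 16; x++)
;           dc += src[x - stride] + src[x * stride - 1];
;       dc >>= 5;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++)
;               src[y * stride + x] = dc;
;   }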

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
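; Comment-only C sketch of VP8's TM ("TrueMotion") mode: top[x] + left[y]
; - topleft, clamped to 0..255 (the packuswb in the loops below does the
; clamping for free).
;
;   static void pred16x16_tm_vp8_c(uint8_t *src, int stride)
;   {
;       int x, y;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++) {
;               int v = src[x - stride] + src[y * stride - 1] - src[-1 - stride];
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }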

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
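; Comment-only C sketch of the h264 flavour of the plane mode (the rv40 and
; svq3 variants below differ in how the H/V sums are scaled, and svq3 swaps
; H and V; see the %ifidn blocks):
;
;   static void pred16x16_plane_h264_c(uint8_t *src, int stride)
;   {
;       int x, y, i, H = 0, V = 0, a, b, c;
;       const uint8_t *top = src - stride;   /* top[-1] is the top-left pixel */
;       for (i = 1; i <= 8; i++) {
;           H += i * (top[7 + i] - top[7 - i]);
;           V += i * (src[(7 + i) * stride - 1] - src[(7 - i) * stride - 1]);
;       }
;       a = 16 * (src[15 * stride - 1] + top[15] + 1);
;       b = (5 * H + 32) >> 6;
;       c = (5 * V + 32) >> 6;
;       for (y = 0; y < 16; y++)
;           for (x = 0; x < 16; x++) {
;               int v = (a + b * (x - 7) + c * (y - 7)) >> 5;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }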

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

%ifidn %3, h264
    pmullw       m0, [pw_5]
    paddw        m0, [pw_32]
    psraw        m0, 6
%elifidn %3, rv40
    pmullw       m0, [pw_5]
    psraw        m0, 6
%elifidn %3, svq3
    movd        r3d, m0
    movsx        r3, r3w
    test         r3, r3
    lea          r4, [r3+3]
    cmovs        r3, r4
    sar          r3, 2           ; H/4
    lea          r3, [r3*5]      ; 5*(H/4)
    test         r3, r3
    lea          r4, [r3+15]
    cmovs        r3, r4
    sar          r3, 4           ; (5*(H/4))/16
    movd         m0, r3d
%endif

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
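; Comment-only C sketch of the chroma 8x8 plane mode; same structure as the
; 16x16 version but with 4 taps per direction and 17/16 scaling:
;
;   static void pred8x8_plane_c(uint8_t *src, int stride)
;   {
;       int x, y, i, H = 0, V = 0, a, b, c;
;       const uint8_t *top = src - stride;   /* top[-1] is the top-left pixel */
;       for (i = 1; i <= 4; i++) {
;           H += i * (top[3 + i] - top[3 - i]);
;           V += i * (src[(3 + i) * stride - 1] - src[(3 - i) * stride - 1]);
;       }
;       a = 16 * (src[7 * stride - 1] + top[7] + 1);
;       b = (17 * H + 16) >> 5;
;       c = (17 * V + 16) >> 5;
;       for (y = 0; y < 8; y++)
;           for (x = 0; x < 8; x++) {
;               int v = (a + b * (x - 3) + c * (y - 3)) >> 5;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }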

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
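; Comment-only C sketch: the top edge is split into two 4-pixel halves,
; each averaged with rounding; dc0 fills the left 4 columns, dc1 the right.
;
;   static void pred8x8_top_dc_c(uint8_t *src, int stride)
;   {
;       int x, y, dc0 = 2, dc1 = 2;
;       for (x = 0; x < 4; x++) {
;           dc0 += src[x     - stride];
;           dc1 += src[x + 4 - stride];
;       }
;       dc0 >>= 2;
;       dc1 >>= 2;
;       for (y = 0; y < 8; y++)
;           for (x = 0; x < 8; x++)
;               src[y * stride + x] = x < 4 ? dc0 : dc1;
;   }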
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
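; Comment-only C sketch: each 4x4 quadrant gets its own DC.  The top-left
; quadrant averages 4 top + 4 left samples ((sum+4)>>3); top-right and
; bottom-left use only their 4 neighbours ((sum+2)>>2); bottom-right mixes
; the top-right and bottom-left sums.
;
;   static void pred8x8_dc_c(uint8_t *src, int stride)
;   {
;       int i, x, y, s0 = 0, s1 = 0, s2 = 0, dc0, dc1, dc2, dc3;
;       for (i = 0; i < 4; i++) {
;           s0 += src[i - stride] + src[i * stride - 1];
;           s1 += src[i + 4 - stride];           /* top,  right half  */
;           s2 += src[(i + 4) * stride - 1];     /* left, bottom half */
;       }
;       dc0 = (s0 + 4) >> 3;
;       dc1 = (s1 + 2) >> 2;
;       dc2 = (s2 + 2) >> 2;
;       dc3 = (s1 + s2 + 4) >> 3;
;       for (y = 0; y < 8; y++)
;           for (x = 0; x < 8; x++)
;               src[y * stride + x] = y < 4 ? (x < 4 ? dc0 : dc1)
;                                           : (x < 4 ? dc2 : dc3);
;   }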
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
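; Comment-only C sketch: RV40 uses a single DC over all 8 top and 8 left
; neighbours, rounded as (sum + 8) >> 4.
;
;   static void pred8x8_dc_rv40_c(uint8_t *src, int stride)
;   {
;       int x, y, dc = 8;
;       for (x = 0; x < 8; x++)
;           dc += src[x - stride] + src[x * stride - 1];
;       dc >>= 4;
;       for (y = 0; y < 8; y++)
;           for (x = 0; x < 8; x++)
;               src[y * stride + x] = dc;
;   }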

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro
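; Why this works: pavgb computes (a+b+1)>>1, so averaging %4 directly with
; avg(%2,%3) could round up twice.  psubusb first removes the low bit that
; the inner average carried up, (%2^%3)&1, which makes the result match the
; exact per-byte-lane expression below:
;
;   /* comment-only C reference for PRED4x4_LOWPASS */
;   static inline int lowpass(int left, int cur, int right)
;   {
;       return (left + 2 * cur + right + 2) >> 2;
;   }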

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
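; The pred8x8l_* modes below lowpass-filter the edge samples first (the
; .fix_lt_*/.fix_tr_* paths patch up missing top-left/top-right neighbours).
; Comment-only sketch for this one, using lowpass() from above over the 8
; top samples (top[-1]/top[8] come from the fix-up paths):
;
;   dc = 4;
;   for (x = 0; x < 8; x++)
;       dc += lowpass(top[x-1], top[x], top[x+1]);
;   dc >>= 3;   /* then fill all 64 pixels with dc */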
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0
    paddw    mm7, [pw_4]
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3
%endif

;-----------------------------------------------------------------------------
;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r2]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
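; Comment-only sketch: the lowpass-filtered top row is written to all 8 rows.
;
;   for (x = 0; x < 8; x++)
;       t[x] = lowpass(top[x-1], top[x], top[x+1]);
;   for (y = 0; y < 8; y++)
;       for (x = 0; x < 8; x++)
;           src[y * stride + x] = t[x];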
%ifdef CONFIG_GPL
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3
%endif

;-----------------------------------------------------------------------------
;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
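; Comment-only sketch, with t[0..15] the filtered top + top-right samples
; (t[15] replicated as t[16], which yields the spec's (t[14]+3*t[15]+2)>>2
; corner case):
;
;   for (y = 0; y < 8; y++)
;       for (x = 0; x < 8; x++)
;           src[y * stride + x] =
;               (t[x+y] + 2 * t[x+y+1] + t[x+y+2] + 2) >> 2;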
%ifdef CONFIG_GPL
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test         r2, r2 ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
%endif

;-----------------------------------------------------------------------------
;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
1477
INIT_MMX
1478
%define PALIGNR PALIGNR_MMX
1479
cglobal pred8x8l_down_right_mmxext, 4,5
1480
    sub          r0, r3
1481
    lea          r4, [r0+r3*2]
1482
    movq        mm0, [r0+r3*1-8]
1483
    punpckhbw   mm0, [r0+r3*0-8]
1484
    movq        mm1, [r4+r3*1-8]
1485
    punpckhbw   mm1, [r0+r3*2-8]
1486
    mov          r4, r0
1487
    punpckhwd   mm1, mm0
1488
    lea          r0, [r0+r3*4]
1489
    movq        mm2, [r0+r3*1-8]
1490
    punpckhbw   mm2, [r0+r3*0-8]
1491
    lea          r0, [r0+r3*2]
1492
    movq        mm3, [r0+r3*1-8]
1493
    punpckhbw   mm3, [r0+r3*0-8]
1494
    punpckhwd   mm3, mm2
1495
    punpckhdq   mm3, mm1
1496
    lea          r0, [r0+r3*2]
1497
    movq        mm0, [r0+r3*0-8]
1498
    movq        mm1, [r4]
1499
    mov          r0, r4
1500
    movq        mm4, mm3
1501
    movq        mm2, mm3
1502
    PALIGNR     mm4, mm0, 7, mm0
1503
    PALIGNR     mm1, mm2, 1, mm2
1504
    test        r1, r1 ; top_left
1505
    jz .fix_lt_1
1506
.do_left:
1507
    movq        mm0, mm4
1508
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1509
    movq        mm4, mm0
1510
    movq        mm7, mm2
1511
    movq        mm6, mm2
1512
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1513
    psllq       mm1, 56
1514
    PALIGNR     mm7, mm1, 7, mm3
1515
    movq        mm0, [r0-8]
1516
    movq        mm3, [r0]
1517
    movq        mm1, [r0+8]
1518
    movq        mm2, mm3
1519
    movq        mm4, mm3
1520
    PALIGNR     mm2, mm0, 7, mm0
1521
    PALIGNR     mm1, mm4, 1, mm4
1522
    test         r1, r1 ; top_left
1523
    jz .fix_lt_2
1524
    test         r2, r2 ; top_right
1525
    jz .fix_tr_1
1526
.do_top:
1527
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1528
    movq        mm5, mm4
1529
    jmp .body
1530
.fix_lt_1:
1531
    movq        mm5, mm3
1532
    pxor        mm5, mm4
1533
    psrlq       mm5, 56
1534
    psllq       mm5, 48
1535
    pxor        mm1, mm5
1536
    jmp .do_left
1537
.fix_lt_2:
1538
    movq        mm5, mm3
1539
    pxor        mm5, mm2
1540
    psllq       mm5, 56
1541
    psrlq       mm5, 56
1542
    pxor        mm2, mm5
1543
    test         r2, r2 ; top_right
1544
    jnz .do_top
1545
.fix_tr_1:
1546
    movq        mm5, mm3
1547
    pxor        mm5, mm1
1548
    psrlq       mm5, 56
1549
    psllq       mm5, 56
1550
    pxor        mm1, mm5
1551
    jmp .do_top
1552
.body
1553
    lea         r1, [r0+r3*2]
1554
    movq       mm1, mm7
1555
    movq       mm7, mm5
1556
    movq       mm5, mm6
1557
    movq       mm2, mm7
1558
    lea         r2, [r1+r3*2]
1559
    PALIGNR    mm2, mm6, 1, mm0
1560
    movq       mm3, mm7
1561
    PALIGNR    mm3, mm6, 7, mm0
1562
    movq       mm4, mm7
1563
    lea         r4, [r2+r3*2]
1564
    psrlq      mm4, 8
1565
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1566
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1567
    movq [r4+r3*2], mm0
1568
    movq       mm2, mm1
1569
    psrlq      mm0, 8
1570
    psllq      mm2, 56
1571
    psrlq      mm1, 8
1572
    por        mm0, mm2
1573
    movq [r4+r3*1], mm0
1574
    movq       mm2, mm1
1575
    psrlq      mm0, 8
1576
    psllq      mm2, 56
1577
    psrlq      mm1, 8
1578
    por        mm0, mm2
1579
    movq [r2+r3*2], mm0
1580
    movq       mm2, mm1
1581
    psrlq      mm0, 8
1582
    psllq      mm2, 56
1583
    psrlq      mm1, 8
1584
    por        mm0, mm2
1585
    movq [r2+r3*1], mm0
1586
    movq       mm2, mm1
1587
    psrlq      mm0, 8
1588
    psllq      mm2, 56
1589
    psrlq      mm1, 8
1590
    por        mm0, mm2
1591
    movq [r1+r3*2], mm0
1592
    movq       mm2, mm1
1593
    psrlq      mm0, 8
1594
    psllq      mm2, 56
1595
    psrlq      mm1, 8
1596
    por        mm0, mm2
1597
    movq [r1+r3*1], mm0
1598
    movq       mm2, mm1
1599
    psrlq      mm0, 8
1600
    psllq      mm2, 56
1601
    psrlq      mm1, 8
1602
    por        mm0, mm2
1603
    movq [r0+r3*2], mm0
1604
    psrlq      mm0, 8
1605
    psllq      mm1, 56
1606
    por        mm0, mm1
1607
    movq [r0+r3*1], mm0
1608
    RET
1609

    
1610
%macro PRED8x8L_DOWN_RIGHT 1
1611
cglobal pred8x8l_down_right_%1, 4,5
1612
    sub          r0, r3
1613
    lea          r4, [r0+r3*2]
1614
    movq        mm0, [r0+r3*1-8]
1615
    punpckhbw   mm0, [r0+r3*0-8]
1616
    movq        mm1, [r4+r3*1-8]
1617
    punpckhbw   mm1, [r0+r3*2-8]
1618
    mov          r4, r0
1619
    punpckhwd   mm1, mm0
1620
    lea          r0, [r0+r3*4]
1621
    movq        mm2, [r0+r3*1-8]
1622
    punpckhbw   mm2, [r0+r3*0-8]
1623
    lea          r0, [r0+r3*2]
1624
    movq        mm3, [r0+r3*1-8]
1625
    punpckhbw   mm3, [r0+r3*0-8]
1626
    punpckhwd   mm3, mm2
1627
    punpckhdq   mm3, mm1
1628
    lea          r0, [r0+r3*2]
1629
    movq        mm0, [r0+r3*0-8]
1630
    movq        mm1, [r4]
1631
    mov          r0, r4
1632
    movq        mm4, mm3
1633
    movq        mm2, mm3
1634
    PALIGNR     mm4, mm0, 7, mm0
1635
    PALIGNR     mm1, mm2, 1, mm2
1636
    test        r1, r1
1637
    jz .fix_lt_1
1638
    jmp .do_left
1639
.fix_lt_1:
1640
    movq        mm5, mm3
1641
    pxor        mm5, mm4
1642
    psrlq       mm5, 56
1643
    psllq       mm5, 48
1644
    pxor        mm1, mm5
1645
    jmp .do_left
1646
.fix_lt_2:
1647
    movq        mm5, mm3
1648
    pxor        mm5, mm2
1649
    psllq       mm5, 56
1650
    psrlq       mm5, 56
1651
    pxor        mm2, mm5
1652
    test         r2, r2
1653
    jnz .do_top
1654
.fix_tr_1:
1655
    movq        mm5, mm3
1656
    pxor        mm5, mm1
1657
    psrlq       mm5, 56
1658
    psllq       mm5, 56
1659
    pxor        mm1, mm5
1660
    jmp .do_top
1661
.do_left:
1662
    movq        mm0, mm4
1663
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1664
    movq        mm4, mm0
1665
    movq        mm7, mm2
1666
    movq2dq    xmm3, mm2
1667
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1668
    psllq       mm1, 56
1669
    PALIGNR     mm7, mm1, 7, mm3
1670
    movq2dq    xmm1, mm7
1671
    movq        mm0, [r0-8]
1672
    movq        mm3, [r0]
1673
    movq        mm1, [r0+8]
1674
    movq        mm2, mm3
1675
    movq        mm4, mm3
1676
    PALIGNR     mm2, mm0, 7, mm0
1677
    PALIGNR     mm1, mm4, 1, mm4
1678
    test         r1, r1
1679
    jz .fix_lt_2
1680
    test         r2, r2
1681
    jz .fix_tr_1
1682
.do_top:
1683
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1684
    movq2dq   xmm4, mm4
1685
    lea         r1, [r0+r3*2]
1686
    movdqa    xmm0, xmm3
1687
    pslldq    xmm4, 8
1688
    por       xmm3, xmm4
1689
    lea         r2, [r1+r3*2]
1690
    pslldq    xmm4, 1
1691
    por       xmm1, xmm4
1692
    psrldq    xmm0, 7
1693
    pslldq    xmm0, 15
1694
    psrldq    xmm0, 7
1695
    por       xmm1, xmm0
1696
    lea         r0, [r2+r3*2]
1697
    movdqa    xmm2, xmm3
1698
    psrldq    xmm2, 1
1699
INIT_XMM
1700
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1701
    movdqa    xmm1, xmm0
1702
    psrldq    xmm1, 1
1703
    movq [r0+r3*2], xmm0
1704
    movq [r0+r3*1], xmm1
1705
    psrldq    xmm0, 2
1706
    psrldq    xmm1, 2
1707
    movq [r2+r3*2], xmm0
1708
    movq [r2+r3*1], xmm1
1709
    psrldq    xmm0, 2
1710
    psrldq    xmm1, 2
1711
    movq [r1+r3*2], xmm0
1712
    movq [r1+r3*1], xmm1
1713
    psrldq    xmm0, 2
1714
    psrldq    xmm1, 2
1715
    movq [r4+r3*2], xmm0
1716
    movq [r4+r3*1], xmm1
1717
    RET
1718
%endmacro
1719

    
1720
INIT_MMX
1721
%define PALIGNR PALIGNR_MMX
1722
PRED8x8L_DOWN_RIGHT sse2
1723
INIT_MMX
1724
%define PALIGNR PALIGNR_SSSE3
1725
PRED8x8L_DOWN_RIGHT ssse3
1726
%endif
1727

    
1728
;-----------------------------------------------------------------------------
1729
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1730
;-----------------------------------------------------------------------------
1731
%ifdef CONFIG_GPL
1732
INIT_MMX
1733
%define PALIGNR PALIGNR_MMX
1734
cglobal pred8x8l_vertical_right_mmxext, 4,5
1735
    sub          r0, r3
1736
    lea          r4, [r0+r3*2]
1737
    movq        mm0, [r0+r3*1-8]
1738
    punpckhbw   mm0, [r0+r3*0-8]
1739
    movq        mm1, [r4+r3*1-8]
1740
    punpckhbw   mm1, [r0+r3*2-8]
1741
    mov          r4, r0
1742
    punpckhwd   mm1, mm0
1743
    lea          r0, [r0+r3*4]
1744
    movq        mm2, [r0+r3*1-8]
1745
    punpckhbw   mm2, [r0+r3*0-8]
1746
    lea          r0, [r0+r3*2]
1747
    movq        mm3, [r0+r3*1-8]
1748
    punpckhbw   mm3, [r0+r3*0-8]
1749
    punpckhwd   mm3, mm2
1750
    punpckhdq   mm3, mm1
1751
    lea          r0, [r0+r3*2]
1752
    movq        mm0, [r0+r3*0-8]
1753
    movq        mm1, [r4]
1754
    mov          r0, r4
1755
    movq        mm4, mm3
1756
    movq        mm2, mm3
1757
    PALIGNR     mm4, mm0, 7, mm0
1758
    PALIGNR     mm1, mm2, 1, mm2
1759
    test        r1, r1
1760
    jz .fix_lt_1
1761
    jmp .do_left
1762
.fix_lt_1:
1763
    movq        mm5, mm3
1764
    pxor        mm5, mm4
1765
    psrlq       mm5, 56
1766
    psllq       mm5, 48
1767
    pxor        mm1, mm5
1768
    jmp .do_left
1769
.fix_lt_2:
1770
    movq        mm5, mm3
1771
    pxor        mm5, mm2
1772
    psllq       mm5, 56
1773
    psrlq       mm5, 56
1774
    pxor        mm2, mm5
1775
    test         r2, r2
1776
    jnz .do_top
1777
.fix_tr_1:
1778
    movq        mm5, mm3
1779
    pxor        mm5, mm1
1780
    psrlq       mm5, 56
1781
    psllq       mm5, 56
1782
    pxor        mm1, mm5
1783
    jmp .do_top
1784
.do_left:
1785
    movq        mm0, mm4
1786
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1787
    movq        mm7, mm2
1788
    movq        mm0, [r0-8]
1789
    movq        mm3, [r0]
1790
    movq        mm1, [r0+8]
1791
    movq        mm2, mm3
1792
    movq        mm4, mm3
1793
    PALIGNR     mm2, mm0, 7, mm0
1794
    PALIGNR     mm1, mm4, 1, mm4
1795
    test         r1, r1
1796
    jz .fix_lt_2
1797
    test         r2, r2
1798
    jz .fix_tr_1
1799
.do_top
1800
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1801
    lea         r1, [r0+r3*2]
1802
    movq       mm2, mm6
1803
    movq       mm3, mm6
1804
    PALIGNR    mm3, mm7, 7, mm0
1805
    PALIGNR    mm6, mm7, 6, mm1
1806
    movq       mm4, mm3
1807
    pavgb      mm3, mm2
1808
    lea         r2, [r1+r3*2]
1809
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1810
    movq [r0+r3*1], mm3
1811
    movq [r0+r3*2], mm0
1812
    movq       mm5, mm0
1813
    movq       mm6, mm3
1814
    movq       mm1, mm7
1815
    movq       mm2, mm1
1816
    psllq      mm2, 8
1817
    movq       mm3, mm1
1818
    psllq      mm3, 16
1819
    lea         r4, [r2+r3*2]
1820
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1821
    PALIGNR    mm6, mm0, 7, mm2
1822
    movq [r1+r3*1], mm6
1823
    psllq      mm0, 8
1824
    PALIGNR    mm5, mm0, 7, mm1
1825
    movq [r1+r3*2], mm5
1826
    psllq      mm0, 8
1827
    PALIGNR    mm6, mm0, 7, mm2
1828
    movq [r2+r3*1], mm6
1829
    psllq      mm0, 8
1830
    PALIGNR    mm5, mm0, 7, mm1
1831
    movq [r2+r3*2], mm5
1832
    psllq      mm0, 8
1833
    PALIGNR    mm6, mm0, 7, mm2
1834
    movq [r4+r3*1], mm6
1835
    psllq      mm0, 8
1836
    PALIGNR    mm5, mm0, 7, mm1
1837
    movq [r4+r3*2], mm5
1838
    RET
1839

    
1840
%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea           r1, [r0+r3*2]
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4
    movhlps     xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
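; Vertical-left prediction: even output rows are the pavgb average of
; adjacent filtered top pixels, odd rows the 3-tap lowpass of the same edge,
; with the source window advancing one pixel every two rows.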
%ifdef CONFIG_GPL
%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4
    test         r2, r2
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1
    pslldq    xmm1, 1
    pavgb     xmm3, xmm2
    lea         r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

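; Sums the four top neighbors with psadbw and the four left neighbors with
; scalar loads, then broadcasts dc = (sum + 4) >> 3 to all 16 pixels via
; imul with 0x01010101.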
cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

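; TrueMotion prediction: each pixel is clip(top[x] + left[y] - topleft),
; computed two rows per iteration in 16-bit and saturated by packuswb.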
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

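; The SSSE3 version uses tm_shuf (0x03, 0x80 byte pairs) so a single pshufb
; both broadcasts the byte at offset 3 (the left or top-left pixel of each
; movd load) into every word lane and zero-extends it, replacing the scalar
; movzx loads of the MMX loop.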
cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

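; VP8 vertical prediction: the top row is smoothed with the 3-tap filter
; (t[-1] + 2*t[0] + t[1] + 2) >> 2 and the result written to all four rows.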
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
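; Down-left prediction: the pxor/shift sequence duplicates the last
; top-right pixel to form the third lowpass operand, then successive rows
; are byte-shifted views of the filtered top+topright row.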
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
%endif