;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
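
; Each rv40 rounding table above holds 16 entries of 4 words (8 bytes), one
; rounding constant broadcast per entry. The functions below index them as
; [tbl + rnd_bias*8] with rnd_bias = ((my & ~1)*4 + mx) >> 1, i.e. a 4x4 grid
; selected by (my/2, mx/2).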

cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    add           r1, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    add           r1, r4
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro
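
; mv0_pixels_mc8 handles the mx == my == 0 case: no interpolation is needed,
; so each iteration copies four 8-pixel rows (or averages them into dst when
; CHROMAMC_AVG expands to PAVG). Roughly, in C (illustrative only):
;     for (y = 0; y < h; y++, dst += stride, src += stride)
;         for (x = 0; x < 8; x++)
;             dst[x] = src[x];    /* or dst[x] = avg(dst[x], src[x]) */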

%macro chroma_mc8_mmx_func 3
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                              int stride, int h, int mx, int my)
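;
; The non-trivial paths below implement the usual eighth-pel bilinear chroma
; filter. With A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y, a scalar
; reference looks like this (illustrative sketch; the rounding constant and
; the optional averaging depend on the codec and put/avg variant selected by
; %2 and %1):
;     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
;               + rnd) >> 6;
; If mx or my is zero this degenerates to the 1-D filter at .my_is_zero, and
; if both are zero no filtering is needed at all (mv0_pixels_mc8).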
cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r11
%define rnd_2d_rv40 r11
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
%ifdef ARCH_X86_64
    mov          r10, r5
    and          r10, 6         ; &~1 for mx/my=[0,7]
    lea          r10, [r10*4+r4]
    sar         r10d, 1
%define rnd_bias r10
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_1d_tbl]
%endif
%ifndef ARCH_X86_64
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_2d_tbl]
%endif
%ifndef ARCH_X86_64
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4
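    ; note A+B+C+D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, so the loop
    ; below forms a true weighted average of the four neighbouring pixels;
    ; A and D are parked on the aligned stack (AA/DD) because all eight mm
    ; registers are already in use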

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r11, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r11
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0
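    ; m6 now holds the horizontally filtered previous row, (8-x)*src + x*src[+1];
    ; the loop below filters two fresh rows per iteration and blends each with
    ; the row carried over from the previous step, so every source row is only
    ; filtered horizontally once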

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 3
cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
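    ; the shifts/subtractions above pack two 16-bit weights per 32-bit register:
    ; r4 = B<<16 | A and r5 = D<<16 | C, with A = (8-x)*(8-y), B = x*(8-y),
    ; C = (8-x)*y, D = x*y; every product is at most 64, so nothing carries into
    ; the upper half and a single imul yields both weights of a pair, which are
    ; then broadcast below and consumed with pmaddwd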

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
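; H.264 rounds to nearest (4 for the >>3 one-pass filter, 32 for the >>6
; bilinear filter); the VC-1 *_nornd variants use the smaller constants 3/28,
; and RV40 picks its constants from the rnd_rv40_*_tbl tables above based on
; the mv position.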

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVG          %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVG          %1, %2
%endmacro
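
; The put and avg entry points share the same code bodies: CHROMAMC_AVG and
; CHROMAMC_AVG4 expand to NOTHING for the put variants and to
; DIRECT_AVG/COPY_AVG (which use the averaging instruction selected via PAVG,
; pavgb or pavgusb) for the avg variants, and the function macros are then
; instantiated once per codec and instruction set below.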

INIT_MMX
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, mmx_rnd
chroma_mc8_mmx_func put, vc1,  mmx_nornd
chroma_mc8_mmx_func put, rv40, mmx
chroma_mc4_mmx_func put, h264, mmx
chroma_mc4_mmx_func put, rv40, mmx
chroma_mc2_mmx_func put, h264, mmx2

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
%define PAVG          pavgb
chroma_mc8_mmx_func avg, h264, mmx2_rnd
chroma_mc8_mmx_func avg, vc1,  mmx2_nornd
chroma_mc8_mmx_func avg, rv40, mmx2
chroma_mc4_mmx_func avg, h264, mmx2
chroma_mc4_mmx_func avg, rv40, mmx2
chroma_mc2_mmx_func avg, h264, mmx2

%define PAVG          pavgusb
chroma_mc8_mmx_func avg, h264, 3dnow_rnd
chroma_mc8_mmx_func avg, vc1,  3dnow_nornd
chroma_mc8_mmx_func avg, rv40, 3dnow
chroma_mc4_mmx_func avg, h264, 3dnow
chroma_mc4_mmx_func avg, rv40, 3dnow

%macro chroma_mc8_ssse3_func 3
cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
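    ; r6 and r4 now hold the bilinear weights packed as byte pairs,
    ; r6 = B<<8 | A and r4 = D<<8 | C (A = (8-x)*(8-y), B = x*(8-y),
    ; C = (8-x)*y, D = x*y, all <= 64 so each fits in a byte); after
    ; broadcasting them, every pmaddubsw below multiplies the interleaved
    ; source bytes [src[i], src[i+1]] by [A,B] (or [C,D]) and sums each pair
    ; into a word, i.e. one horizontal filter step per instruction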

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    movlhps       m7, m7
    movlhps       m6, m6

    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    punpcklbw     m0, m1
    add           r1, r2
.next2rows:
    movq          m1, [r1     ]
    movq          m2, [r1   +1]
    movq          m3, [r1+r2  ]
    movq          m4, [r1+r2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    punpcklbw     m3, m4
    movdqa        m2, m1
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movdqa        m0, m4
    psrlw         m1, 6
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

    
527
.my_is_zero
528
    mov          r5d, r4d
529
    shl          r4d, 8
530
    add           r4, 8
531
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
532
    movd          m7, r4d
533
    movdqa        m6, [rnd_1d_%2]
534
    pshuflw       m7, m7, 0
535
    movlhps       m7, m7
536

    
537
.next2xrows
538
    movq          m0, [r1     ]
539
    movq          m1, [r1   +1]
540
    movq          m2, [r1+r2  ]
541
    movq          m3, [r1+r2+1]
542
    punpcklbw     m0, m1
543
    punpcklbw     m2, m3
544
    pmaddubsw     m0, m7
545
    pmaddubsw     m2, m7
546
%ifidn %1, avg
547
    movq          m4, [r0   ]
548
    movhps        m4, [r0+r2]
549
%endif
550
    paddw         m0, m6
551
    paddw         m2, m6
552
    psrlw         m0, 3
553
    psrlw         m2, 3
554
    packuswb      m0, m2
555
    CHROMAMC_AVG  m0, m4
556
    movq     [r0   ], m0
557
    movhps   [r0+r2], m0
558
    sub          r3d, 2
559
    lea           r0, [r0+r2*2]
560
    lea           r1, [r1+r2*2]
561
    jg .next2xrows
562
    REP_RET
563

    
564
.mx_is_zero
565
    mov          r4d, r5d
566
    shl          r5d, 8
567
    add           r5, 8
568
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
569
    movd          m7, r5d
570
    movdqa        m6, [rnd_1d_%2]
571
    pshuflw       m7, m7, 0
572
    movlhps       m7, m7
573

    
574
.next2yrows
575
    movq          m0, [r1     ]
576
    movq          m1, [r1+r2  ]
577
    movdqa        m2, m1
578
    movq          m3, [r1+r2*2]
579
    punpcklbw     m0, m1
580
    punpcklbw     m2, m3
581
    pmaddubsw     m0, m7
582
    pmaddubsw     m2, m7
583
%ifidn %1, avg
584
    movq          m4, [r0   ]
585
    movhps        m4, [r0+r2]
586
%endif
587
    paddw         m0, m6
588
    paddw         m2, m6
589
    psrlw         m0, 3
590
    psrlw         m2, 3
591
    packuswb      m0, m2
592
    CHROMAMC_AVG  m0, m4
593
    movq     [r0   ], m0
594
    movhps   [r0+r2], m0
595
    sub          r3d, 2
596
    lea           r0, [r0+r2*2]
597
    lea           r1, [r1+r2*2]
598
    jg .next2yrows
599
    REP_RET
600
%endmacro

%macro chroma_mc4_ssse3_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    pshufw        m7, m7, 0
    pshufw        m6, m6, 0

    movd          m0, [r1     ]
    punpcklbw     m0, [r1   +1]
    add           r1, r2
.next2rows:
    movd          m1, [r1     ]
    movd          m3, [r1+r2  ]
    punpcklbw     m1, [r1   +1]
    punpcklbw     m3, [r1+r2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movq          m0, m4
    psrlw         m1, 6
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM
chroma_mc8_ssse3_func put, h264, ssse3_rnd
chroma_mc8_ssse3_func put, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func put, h264, ssse3

%define CHROMAMC_AVG DIRECT_AVG
%define PAVG         pavgb
INIT_XMM
chroma_mc8_ssse3_func avg, h264, ssse3_rnd
chroma_mc8_ssse3_func avg, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func avg, h264, ssse3