ffmpeg / libavcodec / x86 / h264_chromamc.asm @ ae112918

;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
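
; RV40's rounding bias is position-dependent: the MC macros below index these
; tables with ((my & ~1) * 4 + mx) >> 1, and each entry is replicated as four
; words so it can be added directly to a packed-word accumulator.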
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_28
cextern pw_32
cextern pw_64

SECTION .text
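
; mx == 0 and my == 0: no interpolation is needed, so this helper just copies
; (or, in the avg_* variants, averages) the 8-pixel-wide block, handling four
; rows per loop iteration.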
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    add           r1, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    add           r1, r4
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 3
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                               int stride, int h, int mx, int my)
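;
; Scalar reference for the general case (mx and my both nonzero), which the
; code below vectorizes eight pixels at a time:
;   A = (8-mx)*(8-my), B = mx*(8-my), C = (8-mx)*my, D = mx*my
;   dst[x] = (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + rnd) >> 6
; where rnd is 32 for H.264, 28 for VC-1 (no-rounding mode) and comes from
; rnd_rv40_2d_tbl for RV40. If only one of mx/my is nonzero, a 1-D filter is
; used instead: dst[x] = ((8-k)*src[x] + k*src[x+dxy] + rnd1d) >> 3, with
; k = mx+my and dxy = 1 (horizontal) or stride (vertical).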
cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r11
%define rnd_2d_rv40 r11
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
%ifdef ARCH_X86_64
    mov          r10, r5
    and          r10, 6         ; &~1 for mx/my=[0,7]
    lea          r10, [r10*4+r4]
    sar         r10d, 1
%define rnd_bias r10
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_1d_tbl]
%endif
%ifndef ARCH_X86_64
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea          r11, [rnd_rv40_2d_tbl]
%endif
%ifndef ARCH_X86_64
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD
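    ; Only eight MMX registers are available, so the A and D word vectors are
    ; spilled to the aligned stack area just reserved: A at [rsp], D at [rsp+8].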

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro
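; 4-pixel-wide variant: each row is filtered horizontally into words once, and
; that result (kept in m6/m0) is reused as the top input of the next output
; row, so two output rows are produced per loop iteration.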
%macro chroma_mc4_mmx_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r11, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r11
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro
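; 2-pixel-wide variant: the four bilinear weights are packed into two dwords
; (x*(8-y)<<16 | (8-x)*(8-y) and x*y<<16 | y*(8-x)), so a single pmaddwd on
; the src[0,1,1,2] word vector produces both output pixels' contribution for
; one source row.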
%macro chroma_mc2_mmx_func 3
cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro
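; Constant rounding terms: the 1-D filters compute (val + 4) >> 3 and the 2-D
; filters (val + 32) >> 6 for H.264; VC-1's no-rounding mode uses 3 and 28
; instead. RV40 takes its bias from the rnd_rv40_*_tbl tables above.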
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
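; The put_* versions store the result as-is (CHROMAMC_AVG expands to NOTHING),
; while the avg_* versions blend it with the destination through PAVG
; (pavgb on MMX2/SSSE3, pavgusb on 3DNow!).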
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVG          %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVG          %1, %2
%endmacro

INIT_MMX
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, mmx_rnd
chroma_mc8_mmx_func put, vc1,  mmx_nornd
chroma_mc8_mmx_func put, rv40, mmx
chroma_mc4_mmx_func put, h264, mmx
chroma_mc4_mmx_func put, rv40, mmx
chroma_mc2_mmx_func put, h264, mmx2

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
%define PAVG          pavgb
chroma_mc8_mmx_func avg, h264, mmx2_rnd
chroma_mc8_mmx_func avg, vc1,  mmx2_nornd
chroma_mc8_mmx_func avg, rv40, mmx2
chroma_mc4_mmx_func avg, h264, mmx2
chroma_mc4_mmx_func avg, rv40, mmx2
chroma_mc2_mmx_func avg, h264, mmx2

%define PAVG          pavgusb
chroma_mc8_mmx_func avg, h264, 3dnow_rnd
chroma_mc8_mmx_func avg, vc1,  3dnow_nornd
chroma_mc8_mmx_func avg, rv40, 3dnow
chroma_mc4_mmx_func avg, h264, 3dnow
chroma_mc4_mmx_func avg, rv40, 3dnow
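; SSSE3 versions: neighbouring source bytes are interleaved with punpcklbw and
; multiplied by packed (weight, weight) byte pairs in a single pmaddubsw, so no
; unpacking to words is needed, and two output rows are written per iteration.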
%macro chroma_mc8_ssse3_func 3
cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
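    ; Pack the bilinear weights as bytes: r6d ends up holding (8-y)*(8-x) and
    ; (8-y)*x in its low two bytes, r4d holds y*(8-x) and y*x, ready to be
    ; broadcast for pmaddubsw.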
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    movlhps       m7, m7
    movlhps       m6, m6

    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    punpcklbw     m0, m1
    add           r1, r2
.next2rows
    movq          m1, [r1     ]
    movq          m2, [r1   +1]
    movq          m3, [r1+r2  ]
    movq          m4, [r1+r2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    punpcklbw     m3, m4
    movdqa        m2, m1
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movdqa        m0, m4
    psrlw         m1, 6
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movq          m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m6, m6
    movlhps       m7, m7

.next2xrows
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movq          m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m6, m6
    movlhps       m7, m7

.next2yrows
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2yrows
    REP_RET
%endmacro
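; 4-pixel-wide SSSE3 variant: same pmaddubsw scheme as above, but kept in MMX
; registers (pshufw instead of pshuflw+movlhps), still two rows per iteration.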
%macro chroma_mc4_ssse3_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    pshufw        m7, m7, 0
    pshufw        m6, m6, 0

    movd          m0, [r1     ]
    punpcklbw     m0, [r1   +1]
    add           r1, r2
.next2rows
    movd          m1, [r1     ]
    movd          m3, [r1+r2  ]
    punpcklbw     m1, [r1   +1]
    punpcklbw     m3, [r1+r2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movq          m0, m4
    psrlw         m1, 6
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM
chroma_mc8_ssse3_func put, h264, ssse3_rnd
chroma_mc8_ssse3_func put, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func put, h264, ssse3

%define CHROMAMC_AVG DIRECT_AVG
%define PAVG         pavgb
INIT_XMM
chroma_mc8_ssse3_func avg, h264, ssse3_rnd
chroma_mc8_ssse3_func avg, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func avg, h264, ssse3