;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
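; As a rough C-like reference for the macro below (an added sketch, assuming
; both the top row and the left column are available, which is the only case
; this routine handles):
;
;     int dc = 16;                                  // rounding term
;     for (int i = 0; i < 16; i++)
;         dc += src[i - stride] + src[i*stride - 1];
;     dc >>= 5;
;     for (int y = 0; y < 16; y++)
;         memset(src + y*stride, dc, 16);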

%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]
    shr       r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
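; VP8 "TrueMotion" prediction; roughly, as an added reference sketch:
;
;     for (int y = 0; y < 16; y++)
;         for (int x = 0; x < 16; x++)
;             src[y*stride + x] = av_clip_uint8(top[x] + left[y] - topleft);
;
; where top[]/left[]/topleft name the neighbouring samples. The final
; packuswb in each iteration performs the clip to 0..255.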

%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]
    mov         r5d, 8
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
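; Reference formula for the 16x16 plane predictor (added sketch, H.264
; flavour; the %3 argument only changes how b and c are rounded for rv40
; and svq3):
;
;     H = sum_{i=1..8} i * (src[7+i, -1] - src[7-i, -1])      ; top row
;     V = sum_{i=1..8} i * (src[-1, 7+i] - src[-1, 7-i])      ; left column
;     a = 16 * (src[-1, 15] + src[15, -1])
;     b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;     pred[x, y] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)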

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

%ifidn %3, h264
    pmullw       m0, [pw_5]
    paddw        m0, [pw_32]
    psraw        m0, 6
%elifidn %3, rv40
    pmullw       m0, [pw_5]
    psraw        m0, 6
%elifidn %3, svq3
    movd        r3d, m0
    movsx        r3, r3w
    test         r3, r3
    lea          r4, [r3+3]
    cmovs        r3, r4
    sar          r3, 2           ; H/4
    lea          r3, [r3*5]      ; 5*(H/4)
    test         r3, r3
    lea          r4, [r3+15]
    cmovs        r3, r4
    sar          r3, 4           ; (5*(H/4))/16
    movd         m0, r3d
%endif

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
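; Same scheme as the 16x16 plane predictor above, scaled down to 8x8 (added
; note): H and V use four sample pairs each, a = 16*(src[-1,7] + src[7,-1]),
; and b = (17*H + 16) >> 5, c = (17*V + 16) >> 5 (hence pw_17/pw_16 below).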

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1
    movq  mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
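; Added sketch of the 8x8 chroma DC rule this routine implements (all
; neighbours assumed present); the block is split into four 4x4 quadrants:
;
;     s0 = sum(top[0..3])    s1 = sum(top[4..7])
;     s2 = sum(left[0..3])   s3 = sum(left[4..7])
;     dc(top-left)     = (s0 + s2 + 4) >> 3
;     dc(top-right)    = (s1 + 2) >> 2
;     dc(bottom-left)  = (s3 + 2) >> 2
;     dc(bottom-right) = (s1 + s3 + 4) >> 3
;
; The pshufw/psrlw/pavgw sequence below evaluates all four divisions at once.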
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
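; RV40 uses a single DC value for the whole 8x8 block; as an added sketch:
;
;     dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4;
;     for (int y = 0; y < 8; y++)
;         memset(src + y*stride, dc, 8);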

cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec        r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
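; The pavgb trick used below (added note): pavgb rounds up, so the carry bit
; ((left ^ right) & 1) is subtracted from avg(left, right) before averaging
; with src again; the result equals the exact formula above without having
; to widen the bytes to words.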
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0
    paddw    mm7, [pw_4]
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
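; Added sketch of what this routine computes (top and left both available):
;
;     dc = (top[0]+top[1]+top[2]+top[3]
;         + left[0]+left[1]+left[2]+left[3] + 4) >> 3;
;
; the imul by 0x01010101 below broadcasts the byte to all four lanes of each
; 32-bit store.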

cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0
    sub     r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1
    movd       mm7, [r0-4]
    pshufb     mm7, mm6
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------

INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
%endif