;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endmacro

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
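; As a rough orientation only, here is a scalar C sketch of what the SIMD loop
; below computes (illustrative, not part of the build; the rounding of
; cvtps2pi/pf2id and the saturation of packssdw are only approximated, and the
; helper name is made up):
;   static void float_to_int16_interleave6_ref(int16_t *dst, const float **src, int len)
;   {
;       for (int i = 0; i < len; i++)
;           for (int c = 0; c < 6; c++) {
;               long v = lrintf(src[c][i]);         /* round to nearest       */
;               if (v >  32767) v =  32767;         /* saturate like packssdw */
;               if (v < -32768) v = -32768;
;               dst[6 * i + c] = v;
;           }
;   }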
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi



%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
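; A rough scalar C model of the loop below, for orientation only (not built;
; the SIMD code applies the >> shift to partially reduced sums rather than to
; each individual product, so intermediate rounding can differ slightly):
;   static int scalarproduct_int16_ref(int16_t *v1, int16_t *v2, int order, int shift)
;   {
;       int res = 0;
;       for (int i = 0; i < order; i++)
;           res += (v1[i] * v2[i]) >> shift;
;       return res;
;   }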
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd   eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
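; Rough C model of what the loop below does, for orientation only (not built;
; the product added back into v1 keeps only the low 16 bits, matching pmullw/paddw):
;   static int scalarproduct_and_madd_int16_ref(int16_t *v1, int16_t *v2, int16_t *v3,
;                                               int order, int mul)
;   {
;       int res = 0;
;       for (int i = 0; i < order; i++) {
;           res   += v1[i] * v2[i];   /* dot product over the old v1 values */
;           v1[i] += mul * v3[i];     /* in-place multiply-accumulate       */
;       }
;       return res;
;   }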
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd   eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15
    and    v2q, ~15
    and    v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd   eax, m6
    RET



; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
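; Rough C model of the median predictor implemented below, for orientation only
; (not built; median3() is a placeholder for the median of its three arguments):
;   static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           l      = median3(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i];
;           tl     = top[i];
;           dst[i] = l;
;       }
;       *left = l; *left_top = tl;
;   }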
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET


%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
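; Conceptually the loop below is a running byte-wise prefix sum (left-neighbor
; prediction) seeded with `left`. A scalar sketch, for orientation only (not
; built; only the low byte of the return value appears to be meaningful):
;   static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src, int w, int left)
;   {
;       uint8_t acc = left;
;       for (int i = 0; i < w; i++) {
;           acc   += src[i];          /* 8-bit wraparound, like paddb */
;           dst[i] = acc;
;       }
;       return acc;
;   }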
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0


; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
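; For orientation, a scalar sketch of the dot product computed below (not built;
; the SIMD loop sums in a different order and appears to assume len is a
; multiple of 4 with 16-byte-aligned inputs, since it uses movaps):
;   static float scalarproduct_float_ref(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }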
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
    .loop:
        movaps   xmm1, [v1q+offsetq]
        mulps    xmm1, [v2q+offsetq]
        addps    xmm0, xmm1
        add      offsetq, 16
        js       .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m,  xmm0
    fld     dword r0m
%endif
    RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
    mov        r11, r5          ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
    mov         r4, r4m         ; end_y
    mov         r5, r5m         ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov      w_reg, r7m
    sub      w_reg, r6m         ; w = end_x - start_x
    sub         r5, r4
%ifdef ARCH_X86_64
    sub         r4, r3
%else
    sub         r4, dword r3m
%endif
    cmp      w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m         ; linesize
%endif
    sal      w_reg, 7           ; w * 128
%ifdef PIC
    lea        rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call     w_reg              ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov      w_reg, r6m         ; start_x
    sub         r0, w_reg
%ifdef ARCH_X86_64
    mov         r3, r0          ; backup of buf+block_h*linesize
    mov         r5, r11
%else
    mov        r0m, r0          ; backup of buf+block_h*linesize
    mov         r5, r5m
%endif
    test     w_reg, w_reg
    jz .right_extend
    cmp      w_reg, 22
    jg .slow_left_extend_loop
    mov         r1, w_reg
    dec      w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar      w_reg, 1
    sal      w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea        rax, [.emuedge_extend_left_2]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call     w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov         r0, r0m
    mov         r5, r5m
%endif
    mov      w_reg, r7m         ; end_x
    mov         r1, r8m         ; block_w
    mov         r4, r1
    sub         r1, w_reg
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
    cmp         r1, 22
    jg .slow_right_extend_loop
    dec         r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar         r1, 1
    sal         r1, 6
%ifdef PIC
    lea        rax, [.emuedge_extend_right_2]
    add         r1, rax
%else
    lea         r1, [.emuedge_extend_right_2+r1]
%endif
    call        r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define vald ebx
%define stack_offset 0x14
%endif

%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif ; !mmx

%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov           rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov          vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov          vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov          valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov         valw2, [r1+%%src_off]
%else ; %1 != top
    mov         valw3, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov          vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov    [r0+%%dst_off], valw2
%else ; %1 != top
    mov    [r0+%%dst_off], valw3
%endif ; %1 ==/!= top
    mov  [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test           r3 , r3                   ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop   ;   goto body
%else ; ARCH_X86_32
    cmp      dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top,    %%n, %1          ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES top,    %%n, %1          ;   write bytes
    add            r0 , r2                   ;   dst += linesize
%ifdef ARCH_X86_64
    dec            r3
%else ; ARCH_X86_32
    dec      dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:         ; do {
    READ_NUM_BYTES  body,   %%n, %1          ;   read bytes
    WRITE_NUM_BYTES body,   %%n, %1          ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    add            r1 , r2                   ;   src += linesize
    dec            r4
    jnz .emuedge_copy_body_ %+ %%n %+ _loop  ; } while (--end_y)

    ; copy bottom pixels
    test           r5 , r5                   ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n         ;   goto end
    sub            r1 , r2                   ; src -= linesize
    READ_NUM_BYTES  bottom, %%n, %1          ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:     ; do {
    WRITE_NUM_BYTES bottom, %%n, %1          ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    dec            r5
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro VERTICAL_EXTEND
722

    
723
; left/right (horizontal) fast extend functions
724
; these are essentially identical to the vertical extend ones above,
725
; just left/right separated because number of pixels to extend is
726
; obviously not the same on both sides.
727
; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the
728
; lowest two bytes of the register (so val*0x0101), and are splatted
729
; into each byte of mm0 as well if n_pixels >= 8
730

    
731
%macro READ_V_PIXEL 3
732
    mov        vall, %2
733
    mov        valh, vall
734
%if %1 >= 8
735
    movd        mm0, vald
736
%ifidn %3, mmx
737
    punpcklwd   mm0, mm0
738
    punpckldq   mm0, mm0
739
%else ; !mmx
740
    pshufw      mm0, mm0, 0
741
%endif ; mmx
742
%endif ; %1 >= 8
743
%endmacro
744

    
745
%macro WRITE_V_PIXEL 2
746
%assign %%dst_off 0
747
%rep %1/8
748
    movq [%2+%%dst_off], mm0
749
%assign %%dst_off %%dst_off+8
750
%endrep
751
%if %1 & 4
752
%if %1 >= 8
753
    movd [%2+%%dst_off], mm0
754
%else ; %1 < 8
755
    mov  [%2+%%dst_off]  , valw
756
    mov  [%2+%%dst_off+2], valw
757
%endif ; %1 >=/< 8
758
%assign %%dst_off %%dst_off+4
759
%endif ; %1 & 4
760
%if %1&2
761
    mov  [%2+%%dst_off], valw
762
%endif ; %1 & 2
763
%endmacro
764

    
765
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
766
%macro LEFT_EXTEND 1
767
%assign %%n 2
768
%rep 11
769
ALIGN 64
770
.emuedge_extend_left_ %+ %%n:          ; do {
771
    sub         r0, r2                 ;   dst -= linesize
772
    READ_V_PIXEL  %%n, [r0+r1], %1     ;   read pixels
773
    WRITE_V_PIXEL %%n, r0              ;   write pixels
774
    dec         r5
775
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
776
%ifdef ARCH_X86_64
777
    ret
778
%else ; ARCH_X86_32
779
    rep ret
780
%endif ; ARCH_X86_64/32
781
%assign %%n %%n+2
782
%endrep
783
%endmacro ; LEFT_EXTEND
784

    
785
; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
786
%macro RIGHT_EXTEND 1
787
%assign %%n 2
788
%rep 11
789
ALIGN 64
790
.emuedge_extend_right_ %+ %%n:          ; do {
791
%ifdef ARCH_X86_64
792
    sub        r3, r2                   ;   dst -= linesize
793
    READ_V_PIXEL  %%n, [r3+w_reg-1], %1 ;   read pixels
794
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
795
    dec       r11
796
%else ; ARCH_X86_32
797
    sub        r0, r2                   ;   dst -= linesize
798
    READ_V_PIXEL  %%n, [r0+w_reg-1], %1 ;   read pixels
799
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
800
    dec     r5
801
%endif ; ARCH_X86_64/32
802
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
803
%ifdef ARCH_X86_64
804
    ret
805
%else ; ARCH_X86_32
806
    rep ret
807
%endif ; ARCH_X86_64/32
808
%assign %%n %%n+2
809
%endrep
810

    
811
%ifdef ARCH_X86_32
812
%define stack_offset 0x10
813
%endif
814
%endmacro ; RIGHT_EXTEND
815

    
816
; below follow the "slow" copy/extend functions, these act on a non-fixed
817
; width specified in a register, and run a loop to copy the full amount
818
; of bytes. They are optimized for copying of large amounts of pixels per
819
; line, so they unconditionally splat data into mm registers to copy 8
820
; bytes per loop iteration. It could be considered to use xmm for x86-64
821
; also, but I haven't optimized this as much (i.e. FIXME)
822
%macro V_COPY_NPX 4-5
823
%if %0 == 4
824
    test     w_reg, %4
825
    jz .%1_skip_%4_px
826
%else ; %0 == 5
827
.%1_%4_px_loop:
828
%endif
829
    %3          %2, [r1+cnt_reg]
830
    %3 [r0+cnt_reg], %2
831
    add    cnt_reg, %4
832
%if %0 == 5
833
    sub      w_reg, %4
834
    test     w_reg, %5
835
    jnz .%1_%4_px_loop
836
%endif
837
.%1_skip_%4_px:
838
%endmacro
839

    
840
%macro V_COPY_ROW 3
841
%ifidn %1, bottom
842
    sub         r1, linesize
843
%endif
844
.%1_copy_loop:
845
    xor    cnt_reg, cnt_reg
846
%ifidn %3, mmx
847
%define linesize r2m
848
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
849
%else ; !mmx
850
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
851
%ifdef ARCH_X86_64
852
%define linesize r2
853
    V_COPY_NPX %1, rax , mov,     8
854
%else ; ARCH_X86_32
855
%define linesize r2m
856
    V_COPY_NPX %1,  mm0, movq,    8
857
%endif ; ARCH_X86_64/32
858
%endif ; mmx
859
    V_COPY_NPX %1, vald, mov,     4
860
    V_COPY_NPX %1, valw, mov,     2
861
    V_COPY_NPX %1, vall, mov,     1
862
    mov      w_reg, cnt_reg
863
%ifidn %1, body
864
    add         r1, linesize
865
%endif
866
    add         r0, linesize
867
    dec         %2
868
    jnz .%1_copy_loop
869
%endmacro
870

    
871
%macro SLOW_V_EXTEND 1
872
.slow_v_extend_loop:
873
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
874
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
875
%ifdef ARCH_X86_64
876
    push       r11              ; save old value of block_h
877
    test        r3, r3
878
%define cnt_reg r11
879
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
880
    V_COPY_ROW top, r3, %1
881
%else
882
    cmp  dword r3m, 0
883
%define cnt_reg r2
884
    je .do_body_copy            ; if (!start_y) goto do_body_copy
885
    V_COPY_ROW top, dword r3m, %1
886
%endif
887

    
888
.do_body_copy:
889
    V_COPY_ROW body, r4, %1
890

    
891
%ifdef ARCH_X86_64
892
    pop        r11              ; restore old value of block_h
893
%define cnt_reg r3
894
%endif
895
    test        r5, r5
896
%ifdef ARCH_X86_64
897
    jz .v_extend_end
898
%else
899
    jz .skip_bottom_extend
900
%endif
901
    V_COPY_ROW bottom, r5, %1
902
%ifdef ARCH_X86_32
903
.skip_bottom_extend:
904
    mov         r2, r2m
905
%endif
906
    jmp .v_extend_end
907
%endmacro
908

    
909
%macro SLOW_LEFT_EXTEND 1
910
.slow_left_extend_loop:
911
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
912
    mov         r4, 8
913
    sub         r0, linesize
914
    READ_V_PIXEL 8, [r0+w_reg], %1
915
.left_extend_8px_loop:
916
    movq [r0+r4-8], mm0
917
    add         r4, 8
918
    cmp         r4, w_reg
919
    jle .left_extend_8px_loop
920
    sub         r4, 8
921
    cmp         r4, w_reg
922
    jge .left_extend_loop_end
923
.left_extend_2px_loop:
924
    mov    [r0+r4], valw
925
    add         r4, 2
926
    cmp         r4, w_reg
927
    jl .left_extend_2px_loop
928
.left_extend_loop_end:
929
    dec         r5
930
    jnz .slow_left_extend_loop
931
%ifdef ARCH_X86_32
932
    mov         r2, r2m
933
%endif
934
    jmp .right_extend
935
%endmacro
936

    
937
%macro SLOW_RIGHT_EXTEND 1
938
.slow_right_extend_loop:
939
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
940
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
941
%ifdef ARCH_X86_64
942
%define buf_reg r3
943
%define bh_reg r11
944
%else
945
%define buf_reg r0
946
%define bh_reg r5
947
%endif
948
    lea         r1, [r4-8]
949
    sub    buf_reg, linesize
950
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
951
.right_extend_8px_loop:
952
    movq [buf_reg+r1], mm0
953
    sub         r1, 8
954
    cmp         r1, w_reg
955
    jge .right_extend_8px_loop
956
    add         r1, 8
957
    cmp         r1, w_reg
958
    je .right_extend_loop_end
959
.right_extend_2px_loop:
960
    sub         r1, 2
961
    mov [buf_reg+r1], valw
962
    cmp         r1, w_reg
963
    jg .right_extend_2px_loop
964
.right_extend_loop_end:
965
    dec         bh_reg
966
    jnz .slow_right_extend_loop
967
    jmp .h_extend_end
968
%endmacro
969

    
970
%macro emu_edge 1
971
EMU_EDGE_FUNC     %1
972
VERTICAL_EXTEND   %1
973
LEFT_EXTEND       %1
974
RIGHT_EXTEND      %1
975
SLOW_V_EXTEND     %1
976
SLOW_LEFT_EXTEND  %1
977
SLOW_RIGHT_EXTEND %1
978
%endmacro
979

    
980
emu_edge sse
981
%ifdef ARCH_X86_32
982
emu_edge mmx
983
%endif