ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ e6e98234

;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd   eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd   eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
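
; [Editor's note] A rough scalar model of the two routines above, added for
; reference only. This is a sketch, not the Libav C source; in particular the
; SIMD code applies ">> shift" (psrad) to the packed partial sums before the
; final horizontal add, so the exact rounding may differ from a per-product
; shift.
;   int32_t scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
;   {
;       int32_t res = 0;
;       for (int i = 0; i < order; i++)
;           res += (v1[i] * v2[i]) >> shift;
;       return res;
;   }
;   int32_t scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                         int order, int mul)
;   {
;       int32_t res = 0;
;       for (int i = 0; i < order; i++) {
;           res   += v1[i] * v2[i];    // uses the old v1[i], as the asm does
;           v1[i] += mul * v3[i];      // 16-bit wraparound, like pmullw/paddw
;       }
;       return res;
;   }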

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15
    and    v2q, ~15
    and    v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd   eax, m6
    RET
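
; [Editor's note] A hedged sketch of the alignment dispatch above, in C-like
; pseudocode (illustrative only; v2 and v3 are assumed to share the same
; misalignment modulo 16, which is what the palignr-based loops rely on):
;   int misalign = (uintptr_t)v2 & 15;   // 0,2,4,...,14 for int16_t data
;   v2 &= ~15;  v3 &= ~15;               // align both pointers down
;   goto loop[misalign];                 // each .loopN rebuilds the unaligned
;                                        // vectors from two aligned loads
;                                        // with "palignr ..., N"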


;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw   %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    pshufd   %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
    pshufb  %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
%endmacro
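
; [Editor's note] A worked example of the trick above (illustration only, not
; taken from the Libav sources): for dst = 0x4000 and src = 0x2002,
; dst*src = 0x08008000, so (dst*src) >> 15 = 0x1001.  pmulhw alone gives
; (dst*src) >> 16 = 0x0800; psllw 1 turns that into 0x1000 with bit 0 cleared,
; and the cleared bit is bit 15 of the pmullw result (low word 0x8000, so
; psrlw 15 yields 1), which por merges back in to give 0x1001.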

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw   %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
%if %2
    mova          m5, [pd_16384]
%elifidn %1, ssse3
    mova          m5, [pb_revwords]
    ALIGN 16
%endif
.loop:
%if %2
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova          m3, [windowq+offset2q]
    mova          m4, [ inputq+offset2q]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova          m4, [ inputq+offsetq]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offsetq], m0
%elif %3
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    pmulhrsw      m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw      m0, [ inputq+offsetq ]
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    mova          m2, [ inputq+offsetq ]
    MUL16FIXED    m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED    m2, m0, m3
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m2
%endif
    add      offsetd, mmsize
    sub     offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext,     0, 0
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2,       0, 0
APPLY_WINDOW_INT16 sse2_ba,    1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3,      0, 1
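
; [Editor's note] A hedged scalar model of what the windowing code above
; computes (reference sketch, not the Libav C source; the window is assumed
; symmetric, so only its first half is read):
;   void apply_window_int16(int16_t *output, const int16_t *input,
;                           const int16_t *window, unsigned int len)
;   {
;       for (unsigned i = 0; i < len / 2; i++) {
;           int w = window[i];
;           output[i]           = (input[i]           * w + (1 << 14)) >> 15;
;           output[len - i - 1] = (input[len - i - 1] * w + (1 << 14)) >> 15;
;       }
;   }
; The SIMD loop handles one vector from each half per iteration and reverses
; the window vector (REVERSE_WORDS) for the mirrored half; per the comments
; above, the non-"_ba" mmxext/sse2 variants omit the +(1<<14) rounding term.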


; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
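
; [Editor's note] A scalar model of the median prediction being undone above
; (reference sketch, not the Libav C source; mid_pred() stands for the median
; of its three arguments, and all byte arithmetic wraps modulo 256 just like
; paddb/psubb):
;   void add_hfyu_median_prediction(uint8_t *dst, const uint8_t *top,
;                                   const uint8_t *diff, int w,
;                                   int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           uint8_t t    = top[i];
;           uint8_t pred = mid_pred(l, t, (uint8_t)(t - tl + l));
;           l = dst[i] = pred + diff[i];
;           tl = t;
;       }
;       *left = l;  *left_top = tl;
;   }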


%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
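
; [Editor's note] A scalar model of the left (horizontal) prediction being
; undone above (reference sketch, not the Libav C source; the SIMD loop
; computes the same byte-wise prefix sum with a few shift+pshufb+add passes
; per vector):
;   int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;   {
;       uint8_t acc = left;
;       for (int i = 0; i < w; i++)
;           dst[i] = acc = acc + src[i];   // wraps modulo 256, like paddb
;       return acc;                        // becomes "left" for the next call
;   }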


; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
    .loop:
        movaps   xmm1, [v1q+offsetq]
        mulps    xmm1, [v2q+offsetq]
        addps    xmm0, xmm1
        add      offsetq, 16
        js       .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m,  xmm0
    fld     dword r0m
%endif
    RET
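
; [Editor's note] A scalar model of the float dot product above (reference
; sketch, not the Libav C source; the aligned movaps loads imply 16-byte
; aligned pointers and a len that is a multiple of 4):
;   float scalarproduct_float(const float *v1, const float *v2, int len)
;   {
;       float sum = 0.0f;
;       for (int i = 0; i < len; i++)
;           sum += v1[i] * v2[i];
;       return sum;   // on x86-32 the result is returned on the x87 stack,
;                     // hence the movd/fld pair above
;   }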

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
    mov        r11, r5          ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
    mov         r4, r4m         ; end_y
    mov         r5, r5m         ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov      w_reg, r7m
    sub      w_reg, r6m         ; w = end_x - start_x
    sub         r5, r4
%ifdef ARCH_X86_64
    sub         r4, r3
%else
    sub         r4, dword r3m
%endif
    cmp      w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m         ; linesize
%endif
    sal      w_reg, 7           ; w * 128
%ifdef PIC
    lea        rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call     w_reg              ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov      w_reg, r6m         ; start_x
    sub         r0, w_reg
%ifdef ARCH_X86_64
    mov         r3, r0          ; backup of buf+block_h*linesize
    mov         r5, r11
%else
    mov        r0m, r0          ; backup of buf+block_h*linesize
    mov         r5, r5m
%endif
    test     w_reg, w_reg
    jz .right_extend
    cmp      w_reg, 22
    jg .slow_left_extend_loop
    mov         r1, w_reg
    dec      w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar      w_reg, 1
    sal      w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea        rax, [.emuedge_extend_left_2]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call     w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov         r0, r0m
    mov         r5, r5m
%endif
    mov      w_reg, r7m         ; end_x
    mov         r1, r8m         ; block_w
    mov         r4, r1
    sub         r1, w_reg
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
    cmp         r1, 22
    jg .slow_right_extend_loop
    dec         r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar         r1, 1
    sal         r1, 6
%ifdef PIC
    lea        rax, [.emuedge_extend_right_2]
    add         r1, rax
%else
    lea         r1, [.emuedge_extend_right_2+r1]
%endif
    call        r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%ifdef WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif

    
614
%endmacro
615

    
616
; macro to read/write a horizontal number of pixels (%2) to/from registers
617
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
618
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
619
;            - else if (%2 & 8)  fills 8 bytes into mm0
620
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
621
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
622
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
623
;              (note that we're using r3 for body/bottom because it's a shorter
624
;               opcode, and then the loop fits in 128 bytes)
625
;            - else              fills remaining bytes into rax
626
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
627
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
628
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
629
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
630
;            - else              fills remaining bytes into ebx
631
; writing data out is in the same way
632
%macro READ_NUM_BYTES 3
633
%assign %%src_off 0 ; offset in source buffer
634
%assign %%smidx   0 ; mmx register idx
635
%assign %%sxidx   0 ; xmm register idx
636

    
637
%ifnidn %3, mmx
638
%rep %2/16
639
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
640
%assign %%src_off %%src_off+16
641
%assign %%sxidx   %%sxidx+1
642
%endrep ; %2/16
643
%endif ; !mmx
644

    
645
%ifdef ARCH_X86_64
646
%if (%2-%%src_off) == 8
647
    mov           rax, [r1+%%src_off]
648
%assign %%src_off %%src_off+8
649
%endif ; (%2-%%src_off) == 8
650
%endif ; x86-64
651

    
652
%rep (%2-%%src_off)/8
653
    movq    mm %+ %%smidx, [r1+%%src_off]
654
%assign %%src_off %%src_off+8
655
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov          vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov          vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov          valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov         valw2, [r1+%%src_off]
%elifidn %1, body
    mov         valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov         valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov          vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov    [r0+%%dst_off], valw2
%elifidn %1, body
    mov    [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov  [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
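
; [Editor's note] A hedged way to read the pair of macros above: one
; READ_NUM_BYTES/WRITE_NUM_BYTES expansion behaves like
;   memcpy(r0 /* dst row */, r1 /* src row */, n);   // n = %2, fixed at
;                                                    // expansion time (1..22)
; with the data parked in xmm/mm/general-purpose registers, so a single read
; can be replayed by several writes in the top/bottom extend loops below
; without re-reading the source row.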

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test           r3 , r3                   ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop   ;   goto body
%else ; ARCH_X86_32
    cmp      dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top,    %%n, %1          ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES top,    %%n, %1          ;   write bytes
    add            r0 , r2                   ;   dst += linesize
%ifdef ARCH_X86_64
    dec            r3d
%else ; ARCH_X86_32
    dec      dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:         ; do {
    READ_NUM_BYTES  body,   %%n, %1          ;   read bytes
    WRITE_NUM_BYTES body,   %%n, %1          ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    add            r1 , r2                   ;   src += linesize
    dec            r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop  ; } while (--end_y)

    ; copy bottom pixels
    test           r5 , r5                   ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n         ;   goto end
    sub            r1 , r2                   ; src -= linesize
    READ_NUM_BYTES  bottom, %%n, %1          ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:     ; do {
    WRITE_NUM_BYTES bottom, %%n, %1          ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    dec            r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND
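
; [Editor's note] A hedged C model of one .emuedge_v_extend_N entry point
; (N is the fixed byte width; copy_N()/replay_N() are illustrative stand-ins
; for a READ_NUM_BYTES/WRITE_NUM_BYTES pair; by the time these are called,
; r4 = end_y - start_y and r5 = block_h - end_y):
;   for (y = 0; y < start_y; y++)          // replicate the first source row
;       { replay_N(buf, first_src_row); buf += linesize; }
;   for (y = 0; y < end_y - start_y; y++)  // copy the body rows
;       { copy_N(buf, src); buf += linesize; src += linesize; }
;   for (y = 0; y < block_h - end_y; y++)  // replicate the last copied row
;       { replay_N(buf, last_src_row); buf += linesize; }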

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

%macro READ_V_PIXEL 3
    mov        vall, %2
    mov        valh, vall
%if %1 >= 8
    movd        mm0, vald
%ifidn %3, mmx
    punpcklwd   mm0, mm0
    punpckldq   mm0, mm0
%else ; !mmx
    pshufw      mm0, mm0, 0
%endif ; mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov  [%2+%%dst_off]  , valw
    mov  [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov  [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro
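
; [Editor's note] READ_V_PIXEL/WRITE_V_PIXEL together amount to, roughly
; (hedged C model, illustrative names):
;   uint8_t v = *edge_pixel;     // the boundary pixel being extended sideways
;   memset(dst_row, v, n);       // n = %1 pixels, implemented as 8-byte mm0
;                                // stores plus 4/2-byte tail stores of the
;                                // val*0x0101 word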

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:          ; do {
    sub         r0, r2                 ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1], %1     ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    dec         r5
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:          ; do {
%ifdef ARCH_X86_64
    sub        r3, r2                   ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1], %1 ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
    dec       r11
%else ; ARCH_X86_32
    sub        r0, r2                   ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1], %1 ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
    dec     r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
    test     w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3          %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add    cnt_reg, %4
%if %0 == 5
    sub      w_reg, %4
    test     w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 3
%ifidn %1, bottom
    sub         r1, linesize
%endif
.%1_copy_loop:
    xor    cnt_reg, cnt_reg
%ifidn %3, mmx
%define linesize r2m
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
%else ; !mmx
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov,     8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
%endif ; mmx
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
    mov      w_reg, cnt_reg
%ifidn %1, body
    add         r1, linesize
%endif
    add         r0, linesize
    dec         %2
    jnz .%1_copy_loop
%endmacro
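
; [Editor's note] V_COPY_ROW is, in effect, a row copy whose width lives in a
; register (hedged C model; "kind" is the top/body/bottom macro argument):
;   if (kind == bottom) r1 -= linesize;   // step back to the last real row
;   do {
;       memcpy(r0, r1, w);                // done in 16/8/4/2/1-byte chunks
;       r0 += linesize;                   // via V_COPY_NPX, largest first
;       if (kind == body) r1 += linesize; // top/bottom re-read the same row,
;   } while (--row_count);                // i.e. they replicate it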

%macro SLOW_V_EXTEND 1
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push       r11              ; save old value of block_h
    test        r3, r3
%define cnt_reg r11
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
%else
    cmp  dword r3m, 0
%define cnt_reg r2
    je .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
%endif

.do_body_copy:
    V_COPY_ROW body, r4, %1

%ifdef ARCH_X86_64
    pop        r11              ; restore old value of block_h
%define cnt_reg r3
%endif
    test        r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5, %1
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov         r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov         r4, 8
    sub         r0, linesize
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add         r4, 8
    cmp         r4, w_reg
    jle .left_extend_8px_loop
    sub         r4, 8
    cmp         r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov    [r0+r4], valw
    add         r4, 2
    cmp         r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec         r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea         r1, [r4-8]
    sub    buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub         r1, 8
    cmp         r1, w_reg
    jge .right_extend_8px_loop
    add         r1, 8
    cmp         r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub         r1, 2
    mov [buf_reg+r1], valw
    cmp         r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec         bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
EMU_EDGE_FUNC     %1
VERTICAL_EXTEND   %1
LEFT_EXTEND       %1
RIGHT_EXTEND      %1
SLOW_V_EXTEND     %1
SLOW_LEFT_EXTEND  %1
SLOW_RIGHT_EXTEND %1
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif