;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd m3, shiftm
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    psrad m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad m2, m3
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

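; For reference, a rough C sketch of the two functions this macro generates
; (a sketch only, not part of the build; the SIMD code assumes order is a
; multiple of the unroll width and applies the shift during the final
; reduction, so exact rounding/overflow behaviour can differ):
;
;   #include <stdint.h>
;   static int scalarproduct_int16_sketch(const int16_t *v1, const int16_t *v2,
;                                         int order, int shift)
;   {
;       int res = 0;
;       while (order--)
;           res += *v1++ * *v2++;
;       return res >> shift;
;   }
;
;   static int scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
;                                                  const int16_t *v3, int order, int mul)
;   {
;       int res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;
;           *v1++ += mul * *v3++;   /* truncates to 16 bits, like pmullw/paddw */
;       }
;       return res;
;   }
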
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

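; Note on the ssse3 version above: v2 and v3 are read with aligned loads and
; stitched back together with palignr, dispatching on the misalignment of v2.
; This only works because v2 and v3 are assumed to share the same misalignment
; (and v1 is accessed with mova, so it must be 16-byte aligned). A rough C
; sketch of the dispatch idea, for reference only:
;
;   #include <stdint.h>
;   int off = (uintptr_t)v2 & 15;   /* always even for int16_t data: 0,2,...,14 */
;   /* jump to .loop<off>, whose palignr immediate recombines each pair of
;      neighbouring aligned 16-byte loads into the unaligned vector */
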
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET

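; For reference, roughly what the function computes per byte (a sketch, not
; part of the build; median3() is a hypothetical helper written out here,
; FFmpeg's C version uses mid_pred()):
;
;   #include <stdint.h>
;   static uint8_t median3(uint8_t a, uint8_t b, uint8_t c)
;   {
;       uint8_t mx = a > b ? a : b, mn = a > b ? b : a;
;       return c > mx ? mx : (c < mn ? mn : c);
;   }
;   static void add_hfyu_median_prediction_sketch(uint8_t *dst, const uint8_t *top,
;                                                 const uint8_t *diff, int w,
;                                                 int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           l = median3(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i];
;           tl = top[i];
;           dst[i] = l;
;       }
;       *left     = dst[w - 1];
;       *left_top = top[w - 1];
;   }
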
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
    mova m1, [srcq+wq]
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0

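; For reference, roughly what both entry points above compute (a sketch, not
; part of the build): a running byte-wise sum seeded with `left`, returning
; the final accumulator. The SIMD loop computes the same prefix sum inside
; each vector with shift-and-add (pshufb) passes and carries the previous
; vector's last byte forward via the pshufb m0, m5 broadcast.
;
;   #include <stdint.h>
;   static int add_hfyu_left_prediction_sketch(uint8_t *dst, const uint8_t *src,
;                                              int w, int left)
;   {
;       uint8_t acc = left;
;       for (int i = 0; i < w; i++) {
;           acc += src[i];          /* wraps modulo 256, as paddb does */
;           dst[i] = acc;
;       }
;       return acc;                 /* last predictor, returned in eax */
;   }
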
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%ifndef ARCH_X86_64
    movd r0m, xmm0
    fld dword r0m
%endif
    RET

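; For reference, a rough C equivalent (a sketch, not part of the build). The
; SSE version requires 16-byte aligned inputs (movaps) and a len that is a
; multiple of 4, and accumulates in four partial lanes, so float rounding can
; differ from the scalar order:
;
;   static float scalarproduct_float_sketch(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }
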
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

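; For orientation, a loose C sketch of the result the core produces (the
; pointer adjustment done by the C wrapper around this core is glossed over;
; av_clip() is FFmpeg's clamping helper):
;
;   #include <string.h>
;   for (int y = 0; y < block_h; y++) {           /* vertical replication + body copy */
;       int sy = av_clip(y, start_y, end_y - 1);
;       memcpy(buf + y * linesize + start_x,
;              src + (sy - start_y) * linesize + start_x, end_x - start_x);
;   }
;   for (int y = 0; y < block_h; y++) {           /* horizontal replication */
;       memset(buf + y * linesize, buf[y * linesize + start_x], start_x);
;       memset(buf + y * linesize + end_x, buf[y * linesize + end_x - 1],
;              block_w - end_x);
;   }
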
%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
    mov r11, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%ifdef ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%ifdef ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r11
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%define vald  eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define vald  ebx
%define stack_offset 0x14
%endif

%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4) fills 4 bytes into ebx
;            - else if (%2 & 4) fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
;            - else             fills remaining bytes into ebx
; writing data out works the same way
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif ; !mmx

%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%else ; %1 != top
    mov valw3, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%else ; %1 != top
    mov [r0+%%dst_off], valw3
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

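; For reference, the chunking strategy READ_NUM_BYTES/WRITE_NUM_BYTES unroll
; at assembly time is roughly the following (a sketch, not part of the build;
; the real macros stage the data in xmm/mm/general-purpose registers instead
; of going through memcpy):
;
;   #include <stdint.h>
;   #include <string.h>
;   static void copy_w_bytes_sketch(uint8_t *dst, const uint8_t *src, int w)
;   {
;       int off = 0;
;       for (; w - off >= 16; off += 16) memcpy(dst + off, src + off, 16);
;       for (; w - off >= 8;  off += 8)  memcpy(dst + off, src + off, 8);
;       for (; w - off >= 4;  off += 4)  memcpy(dst + off, src + off, 4);
;       for (; w - off >= 2;  off += 2)  memcpy(dst + off, src + off, 2);
;       if (w - off)                     memcpy(dst + off, src + off, 1);
;   }
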
; vertical top/bottom extend and body copy fast loops
; these are the fixed-width line copy functions reached via the computed call
; above, i.e. they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test r3, r3                                 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop      ;   goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n, %1                 ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:           ; do {
    WRITE_NUM_BYTES top, %%n, %1                ;   write bytes
    add r0, r2                                  ;   dst += linesize
%ifdef ARCH_X86_64
    dec r3
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop    ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:            ; do {
    READ_NUM_BYTES  body, %%n, %1               ;   read bytes
    WRITE_NUM_BYTES body, %%n, %1               ;   write bytes
    add r0, r2                                  ;   dst += linesize
    add r1, r2                                  ;   src += linesize
    dec r4
    jnz .emuedge_copy_body_ %+ %%n %+ _loop     ; } while (--end_y)

    ; copy bottom pixels
    test r5, r5                                 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n            ;   goto end
    sub r1, r2                                  ; src -= linesize
    READ_NUM_BYTES bottom, %%n, %1              ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES bottom, %%n, %1             ;   write bytes
    add r0, r2                                  ;   dst += linesize
    dec r5
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just split into left/right because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

%macro READ_V_PIXEL 3
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%ifidn %3, mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%else ; !mmx
    pshufw mm0, mm0, 0
%endif ; mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off]  , valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:        ; do {
    sub r0, r2                       ;   dst -= linesize
    READ_V_PIXEL %%n, [r0+r1], %1    ;   read pixels
    WRITE_V_PIXEL %%n, r0            ;   write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:         ; do {
%ifdef ARCH_X86_64
    sub r3, r2                         ;   dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1], %1 ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ;   write pixels
    dec r11
%else ; ARCH_X86_32
    sub r0, r2                         ;   dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1], %1 ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ;   write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers on x86-64 as well might be
; worthwhile, but that path hasn't been optimized as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 3
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%ifidn %3, mmx
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; !mmx
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax, mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; mmx
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 1
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push r11 ; save old value of block_h
    test r3, r3
%define cnt_reg r11
    jz .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
%endif

.do_body_copy:
    V_COPY_ROW body, r4, %1

%ifdef ARCH_X86_64
    pop r11 ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5, %1
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
EMU_EDGE_FUNC     %1
VERTICAL_EXTEND   %1
LEFT_EXTEND       %1
RIGHT_EXTEND      %1
SLOW_V_EXTEND     %1
SLOW_LEFT_EXTEND  %1
SLOW_RIGHT_EXTEND %1
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif