; ffmpeg/libavcodec/x86/dsputil_yasm.asm @ 888fa31e
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8  db -1
pb_7:                times 8  db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384:            times 4 dd 16384

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
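; For reference, a rough C sketch of the operation (hedged illustration only;
; the helper name is invented, and the SIMD code applies the shift to partial
; sums near the end, which can differ from shifting the full sum by rounding):
;
;   static int scalarproduct_int16_sketch(const int16_t *v1, const int16_t *v2,
;                                         int order, int shift)
;   {
;       int64_t sum = 0;
;       for (int i = 0; i < order; i++)
;           sum += v1[i] * v2[i];      /* pmaddwd sums pairs of products */
;       return (int)(sum >> shift);    /* psrad applies the shift        */
;   }
;
; The loop consumes 8 elements per iteration in the mmx2 build and 16 in the
; sse2 build, so order is assumed to be a multiple of that.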
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl      orderq, 1
    add      v1q, orderq
    add      v2q, orderq
    neg      orderq
    movd     m3, shiftm
    pxor     m2, m2
.loop:
    movu     m0, [v1q + orderq]
    movu     m1, [v1q + orderq + mmsize]
    pmaddwd  m0, [v2q + orderq]
    pmaddwd  m1, [v2q + orderq + mmsize]
    paddd    m2, m0
    paddd    m2, m1
    add      orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps  m0, m2
    paddd    m2, m0
    psrad    m2, m3
    pshuflw  m0, m2, 0x4e
%else
    psrad    m2, m3
    pshufw   m0, m2, 0x4e
%endif
    paddd    m2, m0
    movd     eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
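; A rough C sketch of the combined operation (hedged illustration; the helper
; name is invented): the dot product of v1 and v2 is returned while v1 is
; updated in place with v1[i] += mul * v3[i], in wrapping 16-bit arithmetic.
;
;   static int scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
;                                                  const int16_t *v3, int order, int mul)
;   {
;       int res = 0;
;       for (int i = 0; i < order; i++) {
;           res   += v1[i] * v2[i];               /* pmaddwd + paddd, old v1 */
;           v1[i] += (int16_t)(mul * v3[i]);      /* pmullw  + paddw + mova  */
;       }
;       return res;
;   }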
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl      orderq, 1
    movd     m7, mulm
%if mmsize == 16
    pshuflw  m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw   m7, m7, 0
%endif
    pxor     m6, m6
    add      v1q, orderq
    add      v2q, orderq
    add      v3q, orderq
    neg      orderq
.loop:
    movu     m0, [v2q + orderq]
    movu     m1, [v2q + orderq + mmsize]
    mova     m4, [v1q + orderq]
    mova     m5, [v1q + orderq + mmsize]
    movu     m2, [v3q + orderq]
    movu     m3, [v3q + orderq + mmsize]
    pmaddwd  m0, m4
    pmaddwd  m1, m5
    pmullw   m2, m7
    pmullw   m3, m7
    paddd    m6, m0
    paddd    m6, m1
    paddw    m2, m4
    paddw    m3, m5
    mova     [v1q + orderq], m2
    mova     [v1q + orderq + mmsize], m3
    add      orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps  m0, m6
    paddd    m6, m0
    pshuflw  m0, m6, 0x4e
%else
    pshufw   m0, m6, 0x4e
%endif
    paddd    m6, m0
    movd     eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

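; The SSSE3 variant below avoids unaligned loads: v2 and v3 are rounded down
; to 16-byte alignment, and the shared misalignment (0..14 bytes, always even
; for int16 data) selects one of eight loop bodies whose palignr immediate
; re-synthesizes the unaligned data from two aligned loads. Roughly, in C
; terms (hedged sketch; lo/hi are just names for two adjacent aligned blocks):
;
;   unsigned char buf[32], val[16];
;   memcpy(buf,      lo, 16);           /* aligned load at  p & ~15        */
;   memcpy(buf + 16, hi, 16);           /* aligned load at (p & ~15) + 16  */
;   memcpy(val, buf + (p & 15), 16);    /* == palignr hi, lo, (p & 15)     */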
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub      orderq, mmsize*2
%if %1
    mova     m1, m4
    mova     m4, [v2q + orderq]
    mova     m0, [v2q + orderq + mmsize]
    palignr  m1, m0, %1
    palignr  m0, m4, %1
    mova     m3, m5
    mova     m5, [v3q + orderq]
    mova     m2, [v3q + orderq + mmsize]
    palignr  m3, m2, %1
    palignr  m2, m5, %1
%else
    mova     m0, [v2q + orderq]
    mova     m1, [v2q + orderq + mmsize]
    mova     m2, [v3q + orderq]
    mova     m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova     m8, t0
    mova     m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd  m0, t0
    pmaddwd  m1, t1
    pmullw   m2, m7
    pmullw   m3, m7
    paddw    m2, t0
    paddw    m3, t1
    paddd    m6, m0
    paddd    m6, m1
    mova     [v1q + orderq], m2
    mova     [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl      orderq, 1
    movd     m7, mulm
    pshuflw  m7, m7, 0
    punpcklqdq m7, m7
    pxor     m6, m6
    mov      r4d, v2d
    and      r4d, 15
    and      v2q, ~15
    and      v3q, ~15
    mova     m4, [v2q + orderq]
    mova     m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp      r4d, 0
    je .loop0
    cmp      r4d, 2
    je .loop2
    cmp      r4d, 4
    je .loop4
    cmp      r4d, 6
    je .loop6
    cmp      r4d, 8
    je .loop8
    cmp      r4d, 10
    je .loop10
    cmp      r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps  m0, m6
    paddd    m6, m0
    pshuflw  m0, m6, 0x4e
    paddd    m6, m0
    movd     eax, m6
    RET


;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
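; A hedged C sketch of what these routines implement (illustration only; the
; helper name is invented and the asm walks the two halves with a pair of
; offsets rather than a single index): a symmetric window in Q15 fixed point,
; with the window reversed for the second half of the buffer.
;
;   static void apply_window_int16_sketch(int16_t *output, const int16_t *input,
;                                         const int16_t *window, unsigned int len)
;   {
;       unsigned int i, len2 = len >> 1;
;       for (i = 0; i < len2; i++) {
;           output[i]           = (input[i]           * window[i] + (1 << 14)) >> 15;
;           output[len - i - 1] = (input[len - i - 1] * window[i] + (1 << 14)) >> 15;
;       }
;   }
;
; The mmxext/sse2 "_ba" (bit-exact) variants below implement exactly this
; rounding; the plain mmxext/sse2 variants drop the +(1<<14) term (see the
; comments inside the macro), and the ssse3 variants use pmulhrsw, which rounds.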

%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw   %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    pshufd   %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
    pshufb   %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
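; In other words (a small worked identity on the 16-bit results, not extra
; functionality):
;   (a * b) >> 15  ==  (((a * b) >> 16) << 1)  |  (((a * b) & 0xffff) >> 15)
; pmulhw provides the high 16 bits ((a*b) >> 16) and pmullw the low 16 bits,
; so shifting the former left by one and OR-ing in bit 15 of the latter
; reconstructs the 15-bit shift without widening to 32 bits.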
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova     %3, %1
    pmulhw   %1, %2
    pmullw   %3, %2
    psrlw    %3, 15
    psllw    %1, 1
    por      %1, %3
%endmacro

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea      offset2q, [offsetq-mmsize]
%if %2
    mova     m5, [pd_16384]
%elifidn %1, ssse3
    mova     m5, [pb_revwords]
    ALIGN 16
%endif
.loop:
%if %2
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova     m3, [windowq+offset2q]
    mova     m4, [ inputq+offset2q]
    pxor     m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd  m0, m1
    paddd    m0, m5
    psrad    m0, 15
    pxor     m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd  m2, m1
    paddd    m2, m5
    psrad    m2, 15
    packssdw m0, m2
    mova     [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova     m4, [ inputq+offsetq]
    pxor     m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd  m0, m1
    paddd    m0, m5
    psrad    m0, 15
    pxor     m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd  m2, m1
    paddd    m2, m5
    psrad    m2, 15
    packssdw m0, m2
    mova     [outputq+offsetq], m0
%elif %3
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova     m0, [windowq+offset2q]
    mova     m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova     [outputq+offset2q], m1
    mova     [outputq+offsetq ], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova     m0, [windowq+offset2q]
    mova     m1, [ inputq+offset2q]
    mova     m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova     [outputq+offset2q], m1
    mova     [outputq+offsetq ], m2
%endif
    add      offsetd, mmsize
    sub      offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1


; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
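; A hedged C sketch of the median prediction being undone here (illustration
; only; mid_pred() returns the median of its three arguments, as in FFmpeg's
; mathops.h): each output byte is the median predictor plus the stored
; residual, scanning left to right in wrapping byte arithmetic.
;
;   static void add_hfyu_median_prediction_sketch(uint8_t *dst, const uint8_t *top,
;                                                 const uint8_t *diff, int w,
;                                                 int *left, int *left_top)
;   {
;       int i;
;       uint8_t l = *left, tl = *left_top;
;       for (i = 0; i < w; i++) {
;           l      = mid_pred(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i];
;           tl     = top[i];
;           dst[i] = l;
;       }
;       *left = l; *left_top = tl;
;   }
;
; The asm loads 8 pixels per iteration but the prediction still depends on the
; previous output pixel, hence the unrolled %rep 8 byte loop inside.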
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq     mm0, [topq]
    movq     mm2, mm0
    movd     mm4, [left_topq]
    psllq    mm2, 8
    movq     mm1, mm0
    por      mm4, mm2
    movd     mm3, [leftq]
    psubb    mm0, mm4 ; t-tl
    add      dstq, wq
    add      topq, wq
    add      diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq     mm4, [topq+wq]
    movq     mm0, mm4
    psllq    mm4, 8
    por      mm4, mm1
    movq     mm1, mm0 ; t
    psubb    mm0, mm4 ; t-tl
.skip:
    movq     mm2, [diffq+wq]
%assign i 0
%rep 8
    movq     mm4, mm0
    paddb    mm4, mm3 ; t-tl+l
    movq     mm5, mm3
    pmaxub   mm3, mm1
    pminub   mm5, mm1
    pminub   mm3, mm4
    pmaxub   mm3, mm5 ; median
    paddb    mm3, mm2 ; +residual
%if i==0
    movq     mm7, mm3
    psllq    mm7, 56
%else
    movq     mm6, mm3
    psrlq    mm7, 8
    psllq    mm6, 56
    por      mm7, mm6
%endif
%if i<7
    psrlq    mm0, 8
    psrlq    mm1, 8
    psrlq    mm2, 8
%endif
%assign i i+1
%endrep
    movq     [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx    r2d, byte [dstq-1]
    mov      [leftq], r2d
    movzx    r2d, byte [topq-1]
    mov      [left_topq], r2d
    RET


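; The macro below replaces the serial "add the byte to its left" recurrence
; with a logarithmic number of SIMD adds: a 1-byte-shifted copy is added in
; via psllw, then the running sums are propagated across the 2-, 4- and
; 8-byte group boundaries with the pshufb masks from the rodata section
; above, so every byte ends up holding the sum of all bytes at or before it;
; finally the previous block's last byte is splatted and added as the carry.
; A scalar equivalent per block of mmsize bytes (hedged sketch):
;
;   for (int step = 1; step < mmsize; step <<= 1)        /* log2 passes      */
;       for (int i = mmsize - 1; i >= step; i--)
;           blk[i] = (uint8_t)(blk[i] + blk[i - step]);
;   for (int i = 0; i < mmsize; i++)
;       blk[i] = (uint8_t)(blk[i] + carry);              /* previous block   */
;   carry = blk[mmsize - 1];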
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add      srcq, wq
    add      dstq, wq
    neg      wq
%%.loop:
    mova     m1, [srcq+wq]
    mova     m2, m1
    psllw    m1, 8
    paddb    m1, m2
    mova     m2, m1
    pshufb   m1, m3
    paddb    m1, m2
    pshufb   m0, m5
    mova     m2, m1
    pshufb   m1, m4
    paddb    m1, m2
%if mmsize == 16
    mova     m2, m1
    pshufb   m1, m6
    paddb    m1, m2
%endif
    paddb    m0, m1
%if %1
    mova     [dstq+wq], m0
%else
    movq     [dstq+wq], m0
    movhps   [dstq+wq+8], m0
%endif
    add      wq, mmsize
    jl %%.loop
    mov      eax, mmsize-1
    sub      eax, wd
    movd     m1, eax
    pshufb   m0, m1
    movd     eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
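; A hedged C sketch of the scalar operation (illustration only; the helper
; name is invented): a running byte-wise prefix sum seeded with "left", with
; the final accumulator returned so the caller can chain rows.
;
;   static int add_hfyu_left_prediction_sketch(uint8_t *dst, const uint8_t *src,
;                                              int w, int left)
;   {
;       uint8_t l = left;
;       for (int i = 0; i < w; i++)
;           dst[i] = l = (uint8_t)(l + src[i]);
;       return l;
;   }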
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova     m5, [pb_7]
    mova     m4, [pb_zzzz3333zzzzbbbb]
    mova     m3, [pb_zz11zz55zz99zzdd]
    movd     m0, leftm
    psllq    m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova     m5, [pb_f]
    mova     m6, [pb_zzzzzzzz77777777]
    mova     m4, [pb_zzzz3333zzzzbbbb]
    mova     m3, [pb_zz11zz55zz99zzdd]
    movd     m0, leftm
    pslldq   m0, 15
    test     srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test     dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0


; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
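; A hedged C sketch (illustration only; the helper name is invented): a plain
; dot product over len floats. On x86-32 the result is additionally returned
; on the x87 stack, which is what the fld at the end of the routine is for.
;
;   static float scalarproduct_float_sketch(const float *v1, const float *v2, int len)
;   {
;       float sum = 0.0f;
;       for (int i = 0; i < len; i++)
;           sum += v1[i] * v2[i];
;       return sum;
;   }
;
; The vectorized loop assumes len is a multiple of 4 and sums in a different
; order than the scalar code, so the result may differ in the last bits.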
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg      offsetq
    shl      offsetq, 2
    sub      v1q, offsetq
    sub      v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add      offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%ifndef ARCH_X86_64
    movd     r0m, xmm0
    fld      dword r0m
%endif
    RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
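;
; Conceptually (hedged sketch; the pointer bookkeeping that relates buf and
; src is done by the C wrapper before this core is called): the block_w x
; block_h destination is filled so that the rectangle [start_x,end_x) x
; [start_y,end_y) is a copy of the available source pixels and everything
; outside it replicates the nearest pixel of that rectangle:
;
;   /* after the body rows [start_y,end_y) have been copied into buf */
;   for (y = 0; y < block_h; y++)
;       for (x = 0; x < block_w; x++)
;           if (y < start_y || y >= end_y || x < start_x || x >= end_x)
;               buf[y*linesize + x] =
;                   buf[av_clip(y, start_y, end_y-1)*linesize +
;                       av_clip(x, start_x, end_x-1)];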

%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
    mov      r11, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
    mov      r4, r4m ; end_y
    mov      r5, r5m ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov      w_reg, r7m
    sub      w_reg, r6m ; w = end_x - start_x
    sub      r5, r4
%ifdef ARCH_X86_64
    sub      r4, r3
%else
    sub      r4, dword r3m
%endif
    cmp      w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov      r2, r2m ; linesize
%endif
    sal      w_reg, 7 ; w * 128
%ifdef PIC
    lea      rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call     w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov      w_reg, r6m ; start_x
    sub      r0, w_reg
%ifdef ARCH_X86_64
    mov      r3, r0 ; backup of buf+block_h*linesize
    mov      r5, r11
%else
    mov      r0m, r0 ; backup of buf+block_h*linesize
    mov      r5, r5m
%endif
    test     w_reg, w_reg
    jz .right_extend
    cmp      w_reg, 22
    jg .slow_left_extend_loop
    mov      r1, w_reg
    dec      w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar      w_reg, 1
    sal      w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea      rax, [.emuedge_extend_left_2]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call     w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov      r0, r0m
    mov      r5, r5m
%endif
    mov      w_reg, r7m ; end_x
    mov      r1, r8m ; block_w
    mov      r4, r1
    sub      r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp      r1, 22
    jg .slow_right_extend_loop
    dec      r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar      r1, 1
    sal      r1, 6
%ifdef PIC
    lea      rax, [.emuedge_extend_right_2]
    add      r1, rax
%else
    lea      r1, [.emuedge_extend_right_2+r1]
%endif
    call     r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%ifdef WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif

%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8)  fills the last 8 bytes into rax
;            - else if (%2 & 8)   fills 8 bytes into mm0
;            - if (%2 & 7 == 4)   fills the last 4 bytes into rax
;            - else if (%2 & 4)   fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)   fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else               fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)   fills 4 bytes into ebx
;            - else if (%2 & 4)   fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)   fills 2 bytes into r6, and 1 into ebx
;            - else               fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu   xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif ; !mmx

%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov      rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq     mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov      vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd     mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov      vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov      valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov      valw2, [r1+%%src_off]
%elifidn %1, body
    mov      valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov      valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov      vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu   [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov      [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq     [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov      [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd     [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov      [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov      [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov      [r0+%%dst_off], valw2
%elifidn %1, body
    mov      [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov      [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov      [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test     r3 , r3                   ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
    cmp      dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n, %1       ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:  ; do {
    WRITE_NUM_BYTES top, %%n, %1       ;   write bytes
    add      r0 , r2                   ;   dst += linesize
%ifdef ARCH_X86_64
    dec      r3d
%else ; ARCH_X86_32
    dec      dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:   ; do {
    READ_NUM_BYTES  body, %%n, %1      ;   read bytes
    WRITE_NUM_BYTES body, %%n, %1      ;   write bytes
    add      r0 , r2                   ;   dst += linesize
    add      r1 , r2                   ;   src += linesize
    dec      r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)

    ; copy bottom pixels
    test     r5 , r5                   ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n   ;   goto end
    sub      r1 , r2                   ; src -= linesize
    READ_NUM_BYTES  bottom, %%n, %1    ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n, %1    ;   write bytes
    add      r0 , r2                   ;   dst += linesize
    dec      r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

%macro READ_V_PIXEL 3
    mov      vall, %2
    mov      valh, vall
%if %1 >= 8
    movd     mm0, vald
%ifidn %3, mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%else ; !mmx
    pshufw   mm0, mm0, 0
%endif ; mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq     [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd     [%2+%%dst_off], mm0
%else ; %1 < 8
    mov      [%2+%%dst_off]  , valw
    mov      [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov      [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:          ; do {
    sub      r0, r2                    ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1], %1     ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    dec      r5
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:         ; do {
%ifdef ARCH_X86_64
    sub      r3, r2                    ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1], %1 ;  read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ;   write pixels
    dec      r11
%else ; ARCH_X86_32
    sub      r0, r2                    ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1], %1 ;  read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ;   write pixels
    dec      r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; Below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
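; Per line, the row copy below boils down to a descending-chunk-size copy
; (a hedged scalar sketch of V_COPY_ROW/V_COPY_NPX; copy16/copy8/... are
; placeholder names for the fixed-size register copies the macros emit):
;
;   for (y = 0; y < rows; y++) {
;       int done = 0, left = w;
;       while (left >= 16) { copy16(r0 + done, r1 + done); done += 16; left -= 16; }
;       if (left & 8)      { copy8 (r0 + done, r1 + done); done += 8; }
;       if (left & 4)      { copy4 (r0 + done, r1 + done); done += 4; }
;       if (left & 2)      { copy2 (r0 + done, r1 + done); done += 2; }
;       if (left & 1)        copy1 (r0 + done, r1 + done);
;       r0 += linesize;                     /* dst advances every row          */
;       if (copying_body)  r1 += linesize;  /* src only advances for body rows */
;   }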
%macro V_COPY_NPX 4-5
%if %0 == 4
    test     w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3       %2, [r1+cnt_reg]
    %3       [r0+cnt_reg], %2
    add      cnt_reg, %4
%if %0 == 5
    sub      w_reg, %4
    test     w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 3
%ifidn %1, bottom
    sub      r1, linesize
%endif
.%1_copy_loop:
    xor      cnt_reg, cnt_reg
%ifidn %3, mmx
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; !mmx
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov,     8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq,    8
%endif ; ARCH_X86_64/32
%endif ; mmx
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
    mov      w_reg, cnt_reg
%ifidn %1, body
    add      r1, linesize
%endif
    add      r0, linesize
    dec      %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 1
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push     r11                       ; save old value of block_h
    test     r3, r3
%define cnt_reg r11
    jz .do_body_copy                   ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
%else
    cmp      dword r3m, 0
%define cnt_reg r2
    je .do_body_copy                   ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
%endif

.do_body_copy:
    V_COPY_ROW body, r4, %1

%ifdef ARCH_X86_64
    pop      r11                       ; restore old value of block_h
%define cnt_reg r3
%endif
    test     r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5, %1
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov      r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov      r4, 8
    sub      r0, linesize
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    movq     [r0+r4-8], mm0
    add      r4, 8
    cmp      r4, w_reg
    jle .left_extend_8px_loop
    sub      r4, 8
    cmp      r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov      [r0+r4], valw
    add      r4, 2
    cmp      r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec      r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov      r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea      r1, [r4-8]
    sub      buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq     [buf_reg+r1], mm0
    sub      r1, 8
    cmp      r1, w_reg
    jge .right_extend_8px_loop
    add      r1, 8
    cmp      r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub      r1, 2
    mov      [buf_reg+r1], valw
    cmp      r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec      bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
EMU_EDGE_FUNC     %1
VERTICAL_EXTEND   %1
LEFT_EXTEND       %1
RIGHT_EXTEND      %1
SLOW_V_EXTEND     %1
SLOW_LEFT_EXTEND  %1
SLOW_RIGHT_EXTEND %1
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif