ffmpeg / libavcodec / i386 / dsputil_mmx_rnd_template.c @ 2600f8c8
/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

// put_pixels
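/*
 * The helper macros used below are defined outside this file (the template
 * is compiled twice with different definitions): MOVQ_BFE(mm6) presumably
 * loads the 0xFE byte mask used by the mask-and-shift averaging trick, and
 * PAVGB/PAVGBP compute per-byte averages of their inputs, rounded
 * ((a + b + 1) >> 1) in the rnd build and truncated ((a + b) >> 1) in the
 * no_rnd build.
 */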
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2)               \n\t"
        "movq %%mm5, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2)               \n\t"
        "movq %%mm5, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

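/*
 * The *_l2 variants average two distinct sources into dst. As the fixed
 * 8-byte offsets applied to %2 below suggest, src2 is read as a packed
 * buffer with an implicit stride of 8 (16 in the 16-pixel-wide version),
 * while src1 and dst advance by the strides passed in by the caller.
 */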
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "add $8, %2                     \n\t"
        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
        "movq %%mm4, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq 8(%2), %%mm3              \n\t"
        "add %4, %1                     \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq %%mm5, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 16(%2), %%mm1             \n\t"
        "add %4, %1                     \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq 24(%2), %%mm3             \n\t"
        "add %4, %1                     \n\t"
        "add $32, %2                    \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "movq %%mm5, (%3)               \n\t"
        "add %5, %3                     \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}

static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2)               \n\t"
        "movq %%mm5, (%2, %3)           \n\t"
        "movq 8(%1), %%mm0              \n\t"
        "movq 9(%1), %%mm1              \n\t"
        "movq 8(%1, %3), %%mm2          \n\t"
        "movq 9(%1, %3), %%mm3          \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm1              \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "movq 1(%1, %3), %%mm3          \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2)               \n\t"
        "movq %%mm5, (%2, %3)           \n\t"
        "movq 8(%1), %%mm0              \n\t"
        "movq 9(%1), %%mm1              \n\t"
        "movq 8(%1, %3), %%mm2          \n\t"
        "movq 9(%1, %3), %%mm3          \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "testl $1, %0                   \n\t"
        " jz 1f                         \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "movq 8(%1), %%mm2              \n\t"
        "movq 8(%2), %%mm3              \n\t"
        "add %4, %1                     \n\t"
        "add $16, %2                    \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3)               \n\t"
        "movq %%mm5, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "decl %0                        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%2), %%mm1               \n\t"
        "movq 8(%1), %%mm2              \n\t"
        "movq 8(%2), %%mm3              \n\t"
        "add %4, %1                     \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3)               \n\t"
        "movq %%mm5, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 16(%2), %%mm1             \n\t"
        "movq 8(%1), %%mm2              \n\t"
        "movq 24(%2), %%mm3             \n\t"
        "add %4, %1                     \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3)               \n\t"
        "movq %%mm5, 8(%3)              \n\t"
        "add %5, %3                     \n\t"
        "add $32, %2                    \n\t"
        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
}

static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "movq (%1), %%mm0               \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq %%mm4, (%2)               \n\t"
        "movq %%mm5, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq %%mm4, (%2)               \n\t"
        "movq %%mm5, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

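/*
 * 2D (x+y) half-pel interpolation: each output byte is the 2x2 neighbourhood
 * sum (A + B + C + D + bias) >> 2, where SET_RND loads bias = 2 for the rnd
 * build and bias = 1 for the no_rnd build, as the comment next to SET_RND
 * notes.
 */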
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm4              \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "xor %%"REG_a", %%"REG_a"       \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddusw %%mm2, %%mm0           \n\t"
        "paddusw %%mm3, %%mm1           \n\t"
        "paddusw %%mm6, %%mm4           \n\t"
        "paddusw %%mm6, %%mm5           \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "psrlw $2, %%mm4                \n\t"
        "psrlw $2, %%mm5                \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "movq %%mm4, (%2, %%"REG_a")    \n\t"
        "add %3, %%"REG_a"              \n\t"

        "movq (%1, %%"REG_a"), %%mm2    \n\t" // 0 <-> 2   1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm2, %%mm4           \n\t"
        "paddusw %%mm3, %%mm5           \n\t"
        "paddusw %%mm6, %%mm0           \n\t"
        "paddusw %%mm6, %%mm1           \n\t"
        "paddusw %%mm4, %%mm0           \n\t"
        "paddusw %%mm5, %%mm1           \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "add %3, %%"REG_a"              \n\t"

        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

// avg_pixels
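/*
 * The avg_* routines write the average of the computed value and the data
 * already present in the destination block (a read-modify-write), instead
 * of simply overwriting it as the put_* routines do.
 */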
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movd %0, %%mm0             \n\t"
            "movd %1, %%mm1             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movd %%mm2, %0             \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

// in case more speed is needed - unrolling would certainly help
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0             \n\t"
            "movq %1, %%mm1             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0             \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0             \n\t"
            "movq %1, %%mm1             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0             \n\t"
            "movq 8%0, %%mm0            \n\t"
            "movq 8%1, %%mm1            \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, 8%0            \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    }
    while (--h);
}

static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0             \n\t"
            "movq 1%1, %%mm1            \n\t"
            "movq %0, %%mm3             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0             \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0             \n\t"
            "movq %2, %%mm1             \n\t"
            "movq %0, %%mm3             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0             \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 8;
    } while (--h);
}

static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0             \n\t"
            "movq 1%1, %%mm1            \n\t"
            "movq %0, %%mm3             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0             \n\t"
            "movq 8%1, %%mm0            \n\t"
            "movq 9%1, %%mm1            \n\t"
            "movq 8%0, %%mm3            \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0            \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0             \n\t"
            "movq %2, %%mm1             \n\t"
            "movq %0, %%mm3             \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0             \n\t"
            "movq 8%1, %%mm0            \n\t"
            "movq 8%2, %%mm1            \n\t"
            "movq 8%0, %%mm3            \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0            \n\t"
            :"+m"(*dst)
            :"m"(*src1), "m"(*src2)
            :"memory");
        dst += dstStride;
        src1 += src1Stride;
        src2 += 16;
    } while (--h);
}

static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        "movq (%1), %%mm0               \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq (%2), %%mm3               \n\t"
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq (%2, %3), %%mm3           \n\t"
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"

        "movq (%1, %3), %%mm1           \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq (%2), %%mm3               \n\t"
        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq (%2, %3), %%mm3           \n\t"
        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm2, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"

        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

// this routine is 'slightly' suboptimal but mostly unused
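/*
 * In the loop below mm6 holds the rounding bias, so the pcmpeqd/paddb pair
 * rebuilds the 0xFE byte mask in a scratch register (all-ones bytes doubled
 * wrap around to 0xFE) before PAVGB, which presumably takes that mask as its
 * last operand, averages the interpolated row with the existing destination
 * data.
 */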
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq (%1), %%mm0               \n\t"
        "movq 1(%1), %%mm4              \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "xor %%"REG_a", %%"REG_a"       \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddusw %%mm2, %%mm0           \n\t"
        "paddusw %%mm3, %%mm1           \n\t"
        "paddusw %%mm6, %%mm4           \n\t"
        "paddusw %%mm6, %%mm5           \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "psrlw $2, %%mm4                \n\t"
        "psrlw $2, %%mm5                \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "pcmpeqd %%mm2, %%mm2           \n\t"
        "paddb %%mm2, %%mm2             \n\t"
        PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq %%mm5, (%2, %%"REG_a")    \n\t"
        "add %3, %%"REG_a"              \n\t"

        "movq (%1, %%"REG_a"), %%mm2    \n\t" // 0 <-> 2   1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm2, %%mm4           \n\t"
        "paddusw %%mm3, %%mm5           \n\t"
        "paddusw %%mm6, %%mm0           \n\t"
        "paddusw %%mm6, %%mm1           \n\t"
        "paddusw %%mm4, %%mm0           \n\t"
        "paddusw %%mm5, %%mm1           \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "pcmpeqd %%mm2, %%mm2           \n\t"
        "paddb %%mm2, %%mm2             \n\t"
        PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        "add %3, %%"REG_a"              \n\t"

        "subl $2, %0                    \n\t"
        "jnz 1b                         \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

//FIXME optimize
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
}