ffmpeg / libavcodec / alpha / dsputil_alpha.c @ 5509bffa
History | View | Annotate | Download (14.5 KB)
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
19 |
|
20 |
#include "asm.h" |
21 |
#include "../dsputil.h" |
22 |
|
23 |
/* IDCT entry points installed by dsputil_init_alpha(); no definition
   appears in this file. */
extern void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block);
extern void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block);
extern void simple_idct_axp(DCTELEM *block);
26 |
|
27 |
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, |
28 |
int line_size, int h); |
29 |
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, |
30 |
int line_size);
|
31 |
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, |
32 |
int line_size);
|
33 |
void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, |
34 |
int line_size);
|
35 |
void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, |
36 |
int line_size);
|
37 |
|
38 |
/* Routines installed by dsputil_init_alpha() only when amask() reports
   MVI support.  Declared here; no definitions appear in this file. */

/* Block fetch / difference. */
void get_pixels_mvi(DCTELEM *restrict block,
                    const uint8_t *restrict pixels, int line_size);
void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
                     int stride);

/* Absolute-difference comparators.  Note that pix_abs16x16_mvi_asm takes
   neither the leading context pointer nor the trailing h that the other
   comparators take. */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2,
                   int line_size, int h);
int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size);
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2,
                        int line_size, int h);
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2,
                        int line_size, int h);
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h);
47 |
|
48 |
#if 0
|
49 |
/* These functions were the base for the optimized assembler routines,
|
50 |
and remain here for documentation purposes. */
|
51 |
/*
 * Reference C version: clamp an 8x8 block of 16-bit coefficients and
 * store it as 8 rows of 8 bytes.
 *
 * block     - 64 coefficients, read 4 at a time as 64-bit words
 * pixels    - destination; advanced by line_size after every row
 * line_size - distance in bytes between destination rows
 *
 * Each group of four coefficients goes through maxsw4(.., 0) and
 * minsw4(.., clampmask) before being packed with pkwb and stored.
 */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    int row;

    for (row = 0; row < 8; row++) {
        uint64_t left, right;

        /* First four coefficients of the row. */
        left = minsw4(maxsw4(ldq(block), 0), clampmask);
        stl(pkwb(left), pixels);

        /* Remaining four; loaded only after the first store, as in the
           original sequence. */
        right = minsw4(maxsw4(ldq(block + 4), 0), clampmask);
        stl(pkwb(right), pixels + 4);

        pixels += line_size;
        block  += 8;
    }
}
|
74 |
|
75 |
/*
 * Reference C version: add an 8x8 block of 16-bit coefficients to the
 * existing destination pixels, clamp, and write the result back.
 *
 * block     - 64 coefficients, read 4 at a time as 64-bit words
 * pixels    - destination, read and rewritten; advanced by line_size
 *             after every row
 * line_size - distance in bytes between destination rows
 */
void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa);     /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    int rows_left;

    signmask ^= signmask >> 1;              /* 0x8000800080008000 */

    for (rows_left = 8; rows_left; rows_left--) {
        uint64_t coef_lo, dst_lo, sign_lo;
        uint64_t coef_hi, dst_hi, sign_hi;

        coef_lo = ldq(block);
        coef_hi = ldq(block + 4);

        dst_lo = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw): strip the top bit of every
           16-bit lane, add, then fold the saved top bits back in. */
        sign_lo  = coef_lo & signmask;
        coef_lo  = (coef_lo & ~signmask) + dst_lo;
        coef_lo ^= sign_lo;
        /* Clamp. */
        coef_lo  = minsw4(maxsw4(coef_lo, 0), clampmask);

        /* Next 4. */
        dst_hi   = unpkbw(ldl(pixels + 4));
        sign_hi  = coef_hi & signmask;
        coef_hi  = (coef_hi & ~signmask) + dst_hi;
        coef_hi ^= sign_hi;
        coef_hi  = minsw4(maxsw4(coef_hi, 0), clampmask);

        stl(pkwb(coef_lo), pixels);
        stl(pkwb(coef_hi), pixels + 4);

        pixels += line_size;
        block  += 8;
    }
}
|
118 |
#endif
|
119 |
|
120 |
/*
 * Zero sizeof(DCTELEM) * 6 * 64 bytes starting at blocks, using 64-bit
 * stores in groups of eight (64 bytes per loop iteration).
 *
 * The buffer is written through a uint64_t pointer, so it presumably has
 * to be 8-byte aligned -- confirm against the callers.
 */
static void clear_blocks_axp(DCTELEM *blocks)
{
    uint64_t *dst = (uint64_t *) blocks;
    /* Number of 64-byte chunks in the 6 * 64 coefficient area. */
    int chunks = sizeof(DCTELEM) * 6 * 64 / (8 * sizeof(uint64_t));

    do {
        dst[0] = 0;
        dst[1] = 0;
        dst[2] = 0;
        dst[3] = 0;
        dst[4] = 0;
        dst[5] = 0;
        dst[6] = 0;
        dst[7] = 0;
        dst += 8;
    } while (--chunks);
}
137 |
|
138 |
/*
 * Average a and b without rounding up: the bits common to both operands
 * plus half of the differing bits.
 *
 * The differing bits are masked with BYTE_VEC(0xfe) before the shift;
 * assuming BYTE_VEC replicates its argument into every byte (defined
 * outside this file -- confirm), this keeps a byte's low bit from
 * shifting into its neighbour, so the eight bytes are averaged
 * independently.
 */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    const uint64_t shared    = a & b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return shared + half_diff;
}
142 |
|
143 |
/*
 * Average a and b with rounding up: the union of the set bits minus half
 * of the differing bits.
 *
 * The differing bits are masked with BYTE_VEC(0xfe) before the shift;
 * assuming BYTE_VEC replicates its argument into every byte (defined
 * outside this file -- confirm), the eight bytes are averaged
 * independently.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t either    = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}
147 |
|
148 |
#if 0
|
149 |
/* The XY2 routines basically utilize this scheme, but reuse parts in
|
150 |
each iteration. */
|
151 |
/*
 * Four-way average of l1..l4 with a BYTE_VEC(0x02) rounding term.
 *
 * Every operand is split into its upper six bits (pre-shifted right by
 * two) and its low two bits.  The upper parts are summed directly; the
 * low parts are summed together with the rounding term, shifted right by
 * two and masked back to two bits before being added in.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2 = BYTE_VEC(0x03);
    uint64_t high_sum, low_sum;

    high_sum  = (l1 & ~low2) >> 2;
    high_sum += (l2 & ~low2) >> 2;
    high_sum += (l3 & ~low2) >> 2;
    high_sum += (l4 & ~low2) >> 2;

    low_sum  = l1 & low2;
    low_sum += l2 & low2;
    low_sum += l3 & low2;
    low_sum += l4 & low2;
    low_sum += BYTE_VEC(0x02);
    low_sum  = (low_sum >> 2) & low2;

    return high_sum + low_sum;
}
|
164 |
#endif
|
165 |
|
166 |
/*
 * Plain row transfer: LOAD one 64-bit word from pixels and STORE it to
 * block, h times, stepping both pointers by line_size.
 *
 * Expands in a scope that provides block, pixels, line_size and h, and
 * modifies block, pixels and h.  The body runs before h is tested, so
 * h == 0 on entry is not handled.
 */
#define OP(LOAD, STORE)                                                     \
    do {                                                                    \
        STORE(LOAD(pixels), block);                                         \
        block  += line_size;                                                \
        pixels += line_size;                                                \
    } while (--h)
|
172 |
|
173 |
/*
 * Horizontal interpolation: for each of h rows, LOAD a 64-bit word,
 * build a second word from it shifted right by one byte with pixels[8]
 * placed in the top byte, and STORE AVG2 of the two.
 *
 * Reads nine bytes per source row (pixels[0..8]).  Expands in a scope
 * that provides block, pixels, line_size and h, and modifies block,
 * pixels and h.  h == 0 on entry is not handled.
 */
#define OP_X2(LOAD, STORE)                                                  \
    do {                                                                    \
        uint64_t here  = LOAD(pixels);                                      \
        uint64_t right = (here >> 8) | ((uint64_t) pixels[8] << 56);        \
                                                                            \
        STORE(AVG2(here, right), block);                                    \
        block  += line_size;                                                \
        pixels += line_size;                                                \
    } while (--h)
|
183 |
|
184 |
/*
 * Vertical interpolation: STORE AVG2 of each source row and the row
 * line_size bytes below it.  Each row is LOADed only once and carried
 * over to the next iteration, so h + 1 source rows are read for h
 * output rows.
 *
 * Expands in a scope that provides block, pixels, line_size and h, and
 * modifies block, pixels and h.  h == 0 on entry is not handled.
 */
#define OP_Y2(LOAD, STORE)                                                  \
    do {                                                                    \
        uint64_t upper = LOAD(pixels);                                      \
                                                                            \
        do {                                                                \
            uint64_t lower;                                                 \
                                                                            \
            pixels += line_size;                                            \
            lower   = LOAD(pixels);                                         \
            STORE(AVG2(upper, lower), block);                               \
            block  += line_size;                                            \
            upper   = lower;                                                \
        } while (--h);                                                      \
    } while (0)
197 |
|
198 |
/*
 * Combined horizontal + vertical interpolation.
 *
 * For every source row the macro forms the row word and the same word
 * shifted right by one byte (with pixels[8] in the top byte), then keeps
 * two partial sums of that pair: the low two bits of each byte (lo) and
 * the remaining bits pre-shifted right by two (hi).  An output row is
 *
 *     (((lo + next_lo + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) + hi + next_hi
 *
 * where the "next" sums come from the row line_size bytes further down.
 * The sums of the lower row are reused as the upper row of the next
 * iteration, so h + 1 source rows (nine bytes each) are read.
 *
 * Expands in a scope that provides block, pixels, line_size and h, and
 * modifies block, pixels and h.  h == 0 on entry is not handled.
 */
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t row_a = LOAD(pixels);                                      \
        uint64_t row_b = (row_a >> 8) | ((uint64_t) pixels[8] << 56);       \
        uint64_t lo    = (row_a & BYTE_VEC(0x03))                           \
                       + (row_b & BYTE_VEC(0x03));                          \
        uint64_t hi    = ((row_a & ~BYTE_VEC(0x03)) >> 2)                   \
                       + ((row_b & ~BYTE_VEC(0x03)) >> 2);                  \
                                                                            \
        do {                                                                \
            uint64_t next_a, next_b;                                        \
            uint64_t next_lo, next_hi;                                      \
            uint64_t result;                                                \
                                                                            \
            pixels += line_size;                                            \
            next_a  = LOAD(pixels);                                         \
            next_b  = (next_a >> 8) | ((uint64_t) pixels[8] << 56);         \
            next_lo = (next_a & BYTE_VEC(0x03))                             \
                    + (next_b & BYTE_VEC(0x03));                            \
            next_hi = ((next_a & ~BYTE_VEC(0x03)) >> 2)                     \
                    + ((next_b & ~BYTE_VEC(0x03)) >> 2);                    \
            result  = (((lo + next_lo + AVG4_ROUNDER) >> 2)                 \
                       & BYTE_VEC(0x03))                                    \
                    + hi + next_hi;                                         \
            STORE(result, block);                                           \
                                                                            \
            block += line_size;                                             \
            lo     = next_lo;                                               \
            hi     = next_hi;                                               \
        } while (--h);                                                      \
    } while (0)
228 |
|
229 |
/*
 * Generate two static functions for one operation kind:
 *
 *   OPNAME_pixelsSUFF_axp    processes one 8-byte-wide column, choosing
 *                            ldq when pixels is 8-byte aligned and uldq
 *                            otherwise;
 *   OPNAME_pixels16SUFF_axp  runs the 8-wide function on the left and
 *                            then on the right half of a 16-wide area.
 *
 * OPKIND is one of the OP* loop macros and STORE the store primitive
 * handed to it.  Only the source pointer is tested for alignment.
 */
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    if (((size_t) pixels & 0x7) == 0) {                                     \
        OPKIND(ldq, STORE);                                                 \
    } else {                                                                \
        OPKIND(uldq, STORE);                                                \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}
248 |
|
249 |
/*
 * Instantiate MAKE_OP for all four interpolation kinds (plain, x2, y2,
 * xy2) of one operation name, i.e. eight functions per use.  The plain
 * variant passes an empty suffix.
 */
#define PIXOP(OPNAME, STORE)                                                \
    MAKE_OP(OPNAME,     , OP,     STORE)                                    \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)                                    \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)                                    \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
254 |
|
255 |
/* Rounding primitives. */
|
256 |
#define AVG2 avg2
|
257 |
#define AVG4 avg4
|
258 |
#define AVG4_ROUNDER BYTE_VEC(0x02) |
259 |
#define STORE(l, b) stq(l, b)
|
260 |
PIXOP(put, STORE); |
261 |
|
262 |
#undef STORE
|
263 |
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
|
264 |
PIXOP(avg, STORE); |
265 |
|
266 |
/* Not rounding primitives. */
|
267 |
#undef AVG2
|
268 |
#undef AVG4
|
269 |
#undef AVG4_ROUNDER
|
270 |
#undef STORE
|
271 |
#define AVG2 avg2_no_rnd
|
272 |
#define AVG4 avg4_no_rnd
|
273 |
#define AVG4_ROUNDER BYTE_VEC(0x01) |
274 |
#define STORE(l, b) stq(l, b)
|
275 |
PIXOP(put_no_rnd, STORE); |
276 |
|
277 |
#undef STORE
|
278 |
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
|
279 |
PIXOP(avg_no_rnd, STORE); |
280 |
|
281 |
/*
 * 16-pixel-wide copy built from put_pixels_axp_asm: the routine is
 * applied to the left 8-byte column first and then to the column
 * starting 8 bytes further right.
 *
 * block     - destination
 * pixels    - source
 * line_size - passed through unchanged to put_pixels_axp_asm
 * h         - passed through unchanged to put_pixels_axp_asm
 */
void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h)
{
    int column;

    for (column = 0; column < 16; column += 8) {
        put_pixels_axp_asm(block + column, pixels + column, line_size, h);
    }
}
287 |
|
288 |
/*
 * Adapter around pix_abs16x16_mvi_asm that accepts (and ignores) a
 * leading context pointer, so the routine can be stored in the c->sad
 * and c->pix_abs tables.
 *
 * s      - unused
 * a, b   - the two pixel areas handed to pix_abs16x16_mvi_asm
 * stride - forwarded as its line_size argument
 *
 * Returns whatever pix_abs16x16_mvi_asm returns.
 *
 * NOTE(review): the other functions stored in the same tables are
 * declared with a trailing "int h" parameter, which this wrapper lacks;
 * confirm against the table's function pointer type.
 */
static int sad16x16_mvi(void *s, uint8_t *a, uint8_t *b, int stride)
{
    int sad = pix_abs16x16_mvi_asm(a, b, stride);

    return sad;
}
292 |
|
293 |
/*
 * Install the Alpha-specific routines into the DSP context.
 *
 * c     - context whose function tables are overwritten
 * avctx - not used by this function
 *
 * In the pixel tables the first index selects the width (0: the
 * pixels16 variants, 1: the 8-byte-wide variants) and the second the
 * interpolation (0: plain, 1: x2, 2: y2, 3: xy2).  The MVI routines are
 * installed only when amask() reports the feature as present.  On exit
 * the clamped put/add routines left in the context -- whether or not the
 * MVI branch replaced them -- are copied into the *_axp_p globals.
 */
void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
{
    /* Rounding put; the plain copies use the _asm routines. */
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

    /* Non-rounding put; the plain copies are shared with the rounding
       table. */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

    /* Rounding avg. */
    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

    /* Non-rounding avg. */
    c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;
    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;

    c->clear_blocks = clear_blocks_axp;

    c->idct_put = simple_idct_put_axp;
    c->idct_add = simple_idct_add_axp;
    c->idct     = simple_idct_axp;

    /* amask clears all bits that correspond to present features. */
    if (!amask(AMASK_MVI)) {
        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;

        c->get_pixels  = get_pixels_mvi;
        c->diff_pixels = diff_pixels_mvi;

        c->sad[0] = sad16x16_mvi;
        c->sad[1] = pix_abs8x8_mvi;

        /* FIXME: pix_abs16x16_mvi_asm cannot be stored in
           c->pix_abs[0][0] directly until the function arguments of the
           asm are fixed; sad16x16_mvi wraps it in the meantime. */
        c->pix_abs[0][0] = sad16x16_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
    }

    /* Must stay after the MVI branch so the globals see its choice. */
    put_pixels_clamped_axp_p = c->put_pixels_clamped;
    add_pixels_clamped_axp_p = c->add_pixels_clamped;
}