ffmpeg / libavcodec / alpha / dsputil_alpha.c @ f5abd9fd

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "asm.h"
#include "../dsputil.h"

void simple_idct_axp(DCTELEM *block);

void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
                        int line_size, int h);
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes.  */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-).  */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw).  */
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4.  */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
#endif
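/* Notes on the MVI reference code above:

   zap(-1, 0xaa) clears bytes 1, 3, 5 and 7 of an all-ones quadword, which
   yields the word-wise clamp mask 0x00ff00ff00ff00ff without loading a
   64-bit constant.

   The "signed subword add" sequence emulates a per-word wraparound
   addition (MMX paddw): stripping the top bit of each 16-bit lane keeps
   carries from crossing lane boundaries, and XOR-ing the saved bits back
   in afterwards is equivalent to adding them, since a carry out of bit 15
   would be lost within the lane anyway.  Masking only one operand is
   enough here because the unpacked pixel bytes never have a lane's top
   bit set.  */
#if 0
/* General form of the trick, as a scalar sketch for documentation only
   (illustrative, not used by this file): four independent 16-bit
   additions packed into one 64-bit word.  */
static inline uint64_t paddw_ref(uint64_t a, uint64_t b)
{
    const uint64_t signmask = 0x8000800080008000ULL;
    uint64_t signs = (a ^ b) & signmask;                /* top bits, added modulo 2 */
    uint64_t sum   = (a & ~signmask) + (b & ~signmask); /* no cross-lane carry      */
    return sum ^ signs;
}
#endif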

static void clear_blocks_axp(DCTELEM *blocks) {
    uint64_t *p = (uint64_t *) blocks;
    int n = sizeof(DCTELEM) * 6 * 64;

    do {
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        p[4] = 0;
        p[5] = 0;
        p[6] = 0;
        p[7] = 0;
        p += 8;
        n -= 8 * 8;
    } while (n);
}

static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
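#if 0
/* Scalar per-byte reference for the two averages above (illustrative
   only, not used).  They rely on
       a + b == 2 * (a & b) + (a ^ b) == 2 * (a | b) - (a ^ b),
   so the halved sum can be formed without a full-width addition; masking
   (a ^ b) with BYTE_VEC(0xfe) before the shift keeps the low bit of each
   byte from leaking into the lane below it.  */
static inline uint8_t avg2_ref(uint8_t a, uint8_t b)
{
    return (a + b + 1) >> 1;    /* what avg2() computes per byte        */
}
static inline uint8_t avg2_no_rnd_ref(uint8_t a, uint8_t b)
{
    return (a + b) >> 1;        /* what avg2_no_rnd() computes per byte */
}
#endif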

#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration.  */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
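/* Splitting each byte into its high six bits (pre-shifted down by two)
   and its low two bits keeps the four-way sum inside one byte lane: the
   four shifted high parts add up to at most 4 * 0x3f = 0xfc, and the
   rounded low-bit term adds at most 3, so no carry can ever spill into
   the neighbouring byte.  */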

#define OP(LOAD, STORE, INCR)           \
    do {                                \
        STORE(LOAD(pixels), block);     \
        pixels += line_size;            \
        block += INCR;                  \
    } while (--h)

#define OP_X2(LOAD, STORE, INCR)                                \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += INCR;                                          \
    } while (--h)
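/* pix2 above is the same quadword advanced by one pixel: the Alpha is
   little endian, so pix1 >> 8 discards pixels[0] and pixels[8] << 56
   slots the ninth pixel into the top byte.  AVG2(pix1, pix2) is then the
   horizontal half-pel interpolation of pixels[x] and pixels[x + 1].  */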

#define OP_Y2(LOAD, STORE, INCR)                \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += INCR;                      \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)

#define OP_XY2(LOAD, STORE, INCR)                                           \
    do {                                                                    \
        uint64_t pix1 = LOAD(pixels);                                       \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1 = LOAD(pixels);                                           \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h;                                           \
            STORE(avg, block);                                              \
                                                                            \
            block += INCR;                                                  \
            pix_l = npix_l;                                                 \
            pix_h = npix_h;                                                 \
        } while (--h);                                                      \
    } while (0)
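/* Only the new row is loaded and split per iteration; the previous row's
   partial sums (pix_l, pix_h) are reused, so each source row is read just
   once.  AVG4_ROUNDER is BYTE_VEC(0x02) for the rounding variants, giving
   (a + b + c + d + 2) >> 2 per byte, and BYTE_VEC(0x01) for the
   no-rounding ones.  */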

#define MAKE_OP(BTYPE, OPNAME, SUFF, OPKIND, STORE, INCR)       \
static void OPNAME ## _pixels ## SUFF ## _axp                   \
        (BTYPE *restrict block, const uint8_t *restrict pixels, \
         int line_size, int h)                                  \
{                                                               \
    if ((size_t) pixels & 0x7) {                                \
        OPKIND(uldq, STORE, INCR);                              \
    } else {                                                    \
        OPKIND(ldq, STORE, INCR);                               \
    }                                                           \
}
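/* The source may be unaligned (arbitrary motion vector offsets), so every
   routine is generated in two flavours and dispatched at run time: uldq()
   for the unaligned quadword load, ldq() for the aligned case.  The
   destination is written with stq() and therefore has to be 8-byte
   aligned.  */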

#define PIXOP(BTYPE, OPNAME, STORE, INCR)               \
    MAKE_OP(BTYPE, OPNAME, ,     OP,     STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _x2,  OP_X2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _y2,  OP_Y2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _xy2, OP_XY2, STORE, INCR);

/* Rounding primitives.  */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg, STORE, line_size);
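/* The avg variants read the destination back with ldq() and average it
   with the freshly computed prediction, i.e. they are read-modify-write
   versions of the put routines.  */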

/* Non-rounding primitives.  */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put_no_rnd, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg_no_rnd, STORE, line_size);

void dsputil_init_alpha(void)
{
    put_pixels_tab[0] = put_pixels_axp_asm;
    put_pixels_tab[1] = put_pixels_x2_axp;
    put_pixels_tab[2] = put_pixels_y2_axp;
    put_pixels_tab[3] = put_pixels_xy2_axp;

    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

    avg_pixels_tab[0] = avg_pixels_axp;
    avg_pixels_tab[1] = avg_pixels_x2_axp;
    avg_pixels_tab[2] = avg_pixels_y2_axp;
    avg_pixels_tab[3] = avg_pixels_xy2_axp;

    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;

    clear_blocks = clear_blocks_axp;

    /* amask clears all bits that correspond to present features.  */
    if (amask(AMASK_MVI) == 0) {
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
}