Statistics
| Branch: | Revision:

ffmpeg / libavcodec / alpha / dsputil_alpha.c @ bb7d4939

History | View | Annotate | Download (7.05 KB)

1
/*
2
 * Alpha optimized DSP utils
3
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19

    
20
#include "asm.h"
21
#include "../dsputil.h"
22

    
23
/* IDCT entry point implemented elsewhere -- presumably the Alpha-tuned
   simple_idct; declared but not referenced in this file (wired up by
   other init code).  NOTE(review): confirm the defining translation
   unit.  */
void simple_idct_axp(DCTELEM *block);

/* Hand-written assembler versions of the clamped put/add routines.
   The C implementations they were derived from are kept below under
   #if 0 for documentation.  */
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
29

    
30
#if 0
31
/* These functions were the base for the optimized assembler routines,
32
   and remain here for documentation purposes.  */
33
/* Clamp an 8x8 block of 16-bit DCT coefficients to [0, 255] and store
   each row as 8 bytes.  Uses Alpha MVI intrinsics: maxsw4/minsw4 do a
   per-16-bit-lane signed max/min, pkwb packs four words down to four
   bytes.  Processes one row (two quads of four coefficients) per loop
   iteration.  */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;  /* 8 rows */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    /* Marks this function as using MVI instructions -- see asm.h for
       the exact mechanism (TODO confirm).  */
    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, shorts1;

        /* First four coefficients: clamp each lane to [0, 255], pack
           to bytes, store 4 bytes.  */
        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        /* Remaining four coefficients of the row.  */
        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}
58

59
/* Add an 8x8 block of DCT coefficients to the 8-bit prediction already
   in `pixels`, clamping each result to [0, 255].
   NOTE(review): not `static`, unlike put_pixels_clamped_mvi above --
   inconsistent, though irrelevant while this stays inside #if 0.  */
void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-).  */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        /* Widen 4 prediction bytes to 4 words.  */
        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw): strip the sign bit of each
           16-bit lane so the plain 64-bit add cannot carry across
           lanes, then restore the sign bits with xor.  */
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp each lane to [0, 255]. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4 pixels, same sequence.  */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        /* Pack both quads back to bytes and store the row.  */
        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
104
#endif
105

    
106
/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
107
   Since the immediate result could be greater than 255, we do the
108
   shift first. The result is too low by one if the bytes were both
109
   odd, so we need to add (l1 & l2) & BYTE_VEC(0x01).  */
110
static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
111
{
112
    UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
113
    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
114
    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
115
    return l1 + l2 + correction;
116
}
117

    
118
/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
119
   The '1' only has an effect when one byte is even and the other odd,
120
   i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
121
   Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01).  */
122
static inline UINT64 avg2(UINT64 l1, UINT64 l2)
123
{
124
    UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
125
    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
126
    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
127
    return l1 + l2 + correction;
128
}
129

    
130
/* Byte-wise four-way rounding average: (b1 + b2 + b3 + b4 + 2) >> 2
   per byte.  The high six bits of each byte are averaged overflow-free
   by pre-shifting every operand down by two; the discarded low two
   bits are summed separately (at most 4 * 3 + 2 = 14, so no carry
   between bytes), rounded, and folded back in.  */
static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
{
    UINT64 low_mask  = BYTE_VEC(0x03);
    UINT64 high_part = ((l1 & ~low_mask) >> 2)
                     + ((l2 & ~low_mask) >> 2)
                     + ((l3 & ~low_mask) >> 2)
                     + ((l4 & ~low_mask) >> 2);
    UINT64 low_bits  = (l1 & low_mask) + (l2 & low_mask)
                     + (l3 & low_mask) + (l4 & low_mask);
    UINT64 low_part  = ((low_bits + BYTE_VEC(0x02)) >> 2) & low_mask;

    return high_part + low_part;
}
143

    
144
/* Four-way average with the smaller bias: identical structure to avg4
   but adds 1 instead of 2 before the shift, i.e.
   (b1 + b2 + b3 + b4 + 1) >> 2 per byte.  */
static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
{
    UINT64 low_mask  = BYTE_VEC(0x03);
    UINT64 high_part = ((l1 & ~low_mask) >> 2)
                     + ((l2 & ~low_mask) >> 2)
                     + ((l3 & ~low_mask) >> 2)
                     + ((l4 & ~low_mask) >> 2);
    UINT64 low_bits  = (l1 & low_mask) + (l2 & low_mask)
                     + (l3 & low_mask) + (l4 & low_mask);
    UINT64 low_part  = ((low_bits + BYTE_VEC(0x01)) >> 2) & low_mask;

    return high_part + low_part;
}
157

    
158
/* Instantiate the pixel-op template in pixops.h as the plain "put"
   family (put_pixels_axp, put_pixels_x2_axp, ... -- see the table
   setup in dsputil_init_alpha below): rounding averages, result
   written straight to memory with a 64-bit store.  */
#define PIXOPNAME(suffix) put ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(l, b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
169

    
170
/* Second instantiation of pixops.h: the "put_no_rnd" family, using the
   truncating averages (avg2_no_rnd/avg4_no_rnd) but the same direct
   64-bit store.  */
#define PIXOPNAME(suffix) put_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(l, b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE
181

    
182
/* The following functions are untested.  */
#if 0

/* "avg" family: like "put", but averages the computed pixels with the
   bytes already at the destination (read-modify-write via ldq/stq).
   NOTE(review): the trailing ';' inside this STORE would misparse in an
   if/else-without-braces context -- harmless only while this block
   stays compiled out.  */
#define PIXOPNAME(suffix) avg ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

/* Same read-modify-write store, with the truncating averages.  */
#define PIXOPNAME(suffix) avg_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

/* "sub" family: subtracts each of the 8 computed bytes from the
   corresponding 16-bit DCTELEM at the destination (presumably for a
   motion-compensation residual -- TODO confirm against pixops.h).
   STORE peels the 64-bit pixel word apart one byte at a time.  */
#define PIXOPNAME(suffix) sub ## suffix
#define BTYPE DCTELEM
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, block) do {             \
    UINT64 xxx = l;                      \
    (block)[0] -= (xxx >>  0) & 0xff;    \
    (block)[1] -= (xxx >>  8) & 0xff;    \
    (block)[2] -= (xxx >> 16) & 0xff;    \
    (block)[3] -= (xxx >> 24) & 0xff;    \
    (block)[4] -= (xxx >> 32) & 0xff;    \
    (block)[5] -= (xxx >> 40) & 0xff;    \
    (block)[6] -= (xxx >> 48) & 0xff;    \
    (block)[7] -= (xxx >> 56) & 0xff;    \
} while (0)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

#endif
232

    
233
/* Install the Alpha-optimized routines into dsputil's global function
   pointer tables.  Called once during codec initialization.
   Tab index layout (inferred from the _x2/_y2/_xy2 suffixes):
   0 = full-pel, 1 = horizontal half-pel, 2 = vertical half-pel,
   3 = both -- TODO confirm against dsputil.h.  */
void dsputil_init_alpha(void)
{
    put_pixels_tab[0] = put_pixels_axp;
    put_pixels_tab[1] = put_pixels_x2_axp;
    put_pixels_tab[2] = put_pixels_y2_axp;
    put_pixels_tab[3] = put_pixels_xy2_axp;

    /* A full-pel copy performs no averaging, so the rounding and
       no-rounding flavours coincide -- reuse put_pixels_axp for
       slot 0.  */
    put_no_rnd_pixels_tab[0] = put_pixels_axp;
    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

    /* amask clears all bits that correspond to present features, so a
       zero result means the CPU implements the MVI extension and the
       assembler versions can be used.  */
    if (amask(AMASK_MVI) == 0) {
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
}