Statistics
| Branch: | Revision:

ffmpeg / libavcodec / alpha / simple_idct_alpha.c @ 2e63619f

History | View | Annotate | Download (7.45 KB)

1
/*
 * Simple IDCT (Alpha optimized)
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * based upon some outcommented C code from mpeg2dec (idct_mmx.c
 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
 *
 * Alpha optimizations by Måns Rullgård <mans@mansr.com>
 *                     and Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
28

    
29
#include "libavcodec/dsputil.h"
30
#include "dsputil_alpha.h"
31
#include "asm.h"
32

    
33
/*
 * Fixed-point IDCT coefficients:
 *     Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
 *
 * W4 is actually exactly 16384, but using 16383 works around
 * accumulating rounding errors for some encoders.
 *
 * The int_fast32_t casts make every product with a coefficient be
 * evaluated in at least int_fast32_t rather than in plain int.
 */
#define W1 ((int_fast32_t) 22725)
#define W2 ((int_fast32_t) 21407)
#define W3 ((int_fast32_t) 19266)
#define W4 ((int_fast32_t) 16383)
#define W5 ((int_fast32_t) 12873)
#define W6 ((int_fast32_t)  8867)
#define W7 ((int_fast32_t)  4520)

/* Right shifts that scale the row-pass and column-pass sums back down. */
#define ROW_SHIFT 11
#define COL_SHIFT 20
45

    
46
/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
47
static inline int idct_row(DCTELEM *row)
48
{
49
    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t;
50
    uint64_t l, r, t2;
51
    l = ldq(row);
52
    r = ldq(row + 4);
53

    
54
    if (l == 0 && r == 0)
55
        return 0;
56

    
57
    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
58

    
59
    if (((l & ~0xffffUL) | r) == 0) {
60
        a0 >>= ROW_SHIFT;
61
        t2 = (uint16_t) a0;
62
        t2 |= t2 << 16;
63
        t2 |= t2 << 32;
64

    
65
        stq(t2, row);
66
        stq(t2, row + 4);
67
        return 1;
68
    }
69

    
70
    a1 = a0;
71
    a2 = a0;
72
    a3 = a0;
73

    
74
    t = extwl(l, 4);            /* row[2] */
75
    if (t != 0) {
76
        t = sextw(t);
77
        a0 += W2 * t;
78
        a1 += W6 * t;
79
        a2 -= W6 * t;
80
        a3 -= W2 * t;
81
    }
82

    
83
    t = extwl(r, 0);            /* row[4] */
84
    if (t != 0) {
85
        t = sextw(t);
86
        a0 += W4 * t;
87
        a1 -= W4 * t;
88
        a2 -= W4 * t;
89
        a3 += W4 * t;
90
    }
91

    
92
    t = extwl(r, 4);            /* row[6] */
93
    if (t != 0) {
94
        t = sextw(t);
95
        a0 += W6 * t;
96
        a1 -= W2 * t;
97
        a2 += W2 * t;
98
        a3 -= W6 * t;
99
    }
100

    
101
    t = extwl(l, 2);            /* row[1] */
102
    if (t != 0) {
103
        t = sextw(t);
104
        b0 = W1 * t;
105
        b1 = W3 * t;
106
        b2 = W5 * t;
107
        b3 = W7 * t;
108
    } else {
109
        b0 = 0;
110
        b1 = 0;
111
        b2 = 0;
112
        b3 = 0;
113
    }
114

    
115
    t = extwl(l, 6);            /* row[3] */
116
    if (t) {
117
        t = sextw(t);
118
        b0 += W3 * t;
119
        b1 -= W7 * t;
120
        b2 -= W1 * t;
121
        b3 -= W5 * t;
122
    }
123

    
124

    
125
    t = extwl(r, 2);            /* row[5] */
126
    if (t) {
127
        t = sextw(t);
128
        b0 += W5 * t;
129
        b1 -= W1 * t;
130
        b2 += W7 * t;
131
        b3 += W3 * t;
132
    }
133

    
134
    t = extwl(r, 6);            /* row[7] */
135
    if (t) {
136
        t = sextw(t);
137
        b0 += W7 * t;
138
        b1 -= W5 * t;
139
        b2 += W3 * t;
140
        b3 -= W1 * t;
141
    }
142

    
143
    row[0] = (a0 + b0) >> ROW_SHIFT;
144
    row[1] = (a1 + b1) >> ROW_SHIFT;
145
    row[2] = (a2 + b2) >> ROW_SHIFT;
146
    row[3] = (a3 + b3) >> ROW_SHIFT;
147
    row[4] = (a3 - b3) >> ROW_SHIFT;
148
    row[5] = (a2 - b2) >> ROW_SHIFT;
149
    row[6] = (a1 - b1) >> ROW_SHIFT;
150
    row[7] = (a0 - b0) >> ROW_SHIFT;
151

    
152
    return 2;
153
}
154

    
155
/**
 * In-place 1-D inverse DCT of one column of an 8x8 block.
 *
 * @param col pointer to the topmost element of the column; the eight
 *            elements are read and written at col[8 * k], k = 0..7
 */
static inline void idct_col(DCTELEM *col)
{
    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;

    /* Rounding bias for the final >> COL_SHIFT, folded into the DC term
       before it is scaled by W4. */
    col[0] += (1 << (COL_SHIFT - 1)) / W4;

    /* Even part: a0..a3 all start from the scaled DC term and then
       accumulate elements 2, 4 and 6; zero elements are skipped. */
    a0 = W4 * col[8 * 0];
    a1 = a0;
    a2 = a0;
    a3 = a0;

    if (col[8 * 2]) {
        a0 += W2 * col[8 * 2];
        a1 += W6 * col[8 * 2];
        a2 -= W6 * col[8 * 2];
        a3 -= W2 * col[8 * 2];
    }

    if (col[8 * 4]) {
        a0 += W4 * col[8 * 4];
        a1 -= W4 * col[8 * 4];
        a2 -= W4 * col[8 * 4];
        a3 += W4 * col[8 * 4];
    }

    if (col[8 * 6]) {
        a0 += W6 * col[8 * 6];
        a1 -= W2 * col[8 * 6];
        a2 += W2 * col[8 * 6];
        a3 -= W6 * col[8 * 6];
    }

    /* Odd part: b0..b3 accumulate elements 1, 3, 5 and 7. */
    if (col[8 * 1]) {
        b0 = W1 * col[8 * 1];
        b1 = W3 * col[8 * 1];
        b2 = W5 * col[8 * 1];
        b3 = W7 * col[8 * 1];
    } else {
        b0 = 0;
        b1 = 0;
        b2 = 0;
        b3 = 0;
    }

    if (col[8 * 3]) {
        b0 += W3 * col[8 * 3];
        b1 -= W7 * col[8 * 3];
        b2 -= W1 * col[8 * 3];
        b3 -= W5 * col[8 * 3];
    }

    if (col[8 * 5]) {
        b0 += W5 * col[8 * 5];
        b1 -= W1 * col[8 * 5];
        b2 += W7 * col[8 * 5];
        b3 += W3 * col[8 * 5];
    }

    if (col[8 * 7]) {
        b0 += W7 * col[8 * 7];
        b1 -= W5 * col[8 * 7];
        b2 += W3 * col[8 * 7];
        b3 -= W1 * col[8 * 7];
    }

    /* Butterfly: outputs k and 7 - k are the sum and difference of the
       matching even/odd accumulators. */
    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
}
229

    
230
/* If all rows but the first one are zero after row transformation,
231
   all rows will be identical after column transformation.  */
232
static inline void idct_col2(DCTELEM *col)
233
{
234
    int i;
235
    uint64_t l, r;
236

    
237
    for (i = 0; i < 8; ++i) {
238
        int_fast32_t a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
239

    
240
        a0 *= W4;
241
        col[i] = a0 >> COL_SHIFT;
242
    }
243

    
244
    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
245
    stq(l, col +  2 * 4); stq(r, col +  3 * 4);
246
    stq(l, col +  4 * 4); stq(r, col +  5 * 4);
247
    stq(l, col +  6 * 4); stq(r, col +  7 * 4);
248
    stq(l, col +  8 * 4); stq(r, col +  9 * 4);
249
    stq(l, col + 10 * 4); stq(r, col + 11 * 4);
250
    stq(l, col + 12 * 4); stq(r, col + 13 * 4);
251
    stq(l, col + 14 * 4); stq(r, col + 15 * 4);
252
}
253

    
254
/**
 * In-place 2-D inverse DCT of an 8x8 block: a row pass followed by a
 * column pass.
 *
 * The row pass reports how sparse each row was, which lets the column
 * pass take one of three routes:
 *  - rows 1..7 were all zero: idct_col2() scales row 0 and copies it down;
 *  - every row is a single repeated value: only column 0 is transformed
 *    and each result is replicated across its row;
 *  - otherwise: all eight columns are transformed individually.
 *
 * @param block the 64 coefficients in row-major order (row i starts at
 *              block + 8 * i); overwritten with the result
 */
void ff_simple_idct_axp(DCTELEM *block)
{
    int i;
    int rowsZero = 1;           /* all rows except row 0 zero */
    int rowsConstant = 1;       /* all rows consist of a constant value */

    for (i = 0; i < 8; i++) {
        int sparseness = idct_row(block + 8 * i);

        if (i > 0 && sparseness > 0)
            rowsZero = 0;
        if (sparseness == 2)
            rowsConstant = 0;
    }

    if (rowsZero) {
        idct_col2(block);
    } else if (rowsConstant) {
        DCTELEM *rows = block;  /* walks the block two rows at a time */

        /* All columns are identical, so transforming the first is enough. */
        idct_col(block);
        for (i = 0; i < 8; i += 2) {
            uint64_t v = (uint16_t) rows[0];    /* column 0 of row i     */
            uint64_t w = (uint16_t) rows[8];    /* column 0 of row i + 1 */

            /* Replicate each 16-bit value into all four lanes of a word. */
            v |= v << 16;
            w |= w << 16;
            v |= v << 32;
            w |= w << 32;
            stq(v, rows + 0 * 4);
            stq(v, rows + 1 * 4);
            stq(w, rows + 2 * 4);
            stq(w, rows + 3 * 4);
            rows += 4 * 4;
        }
    } else {
        for (i = 0; i < 8; i++)
            idct_col(block + i);
    }
}
293

    
294
/**
 * Inverse-transform a block in place, then hand the result to
 * put_pixels_clamped_axp_p() together with the destination.
 *
 * @param dest      destination passed on to put_pixels_clamped_axp_p()
 * @param line_size passed on unchanged -- presumably the byte stride
 *                  between destination rows, confirm in dsputil_alpha.h
 * @param block     8x8 coefficient block; overwritten by the IDCT
 */
void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_simple_idct_axp(block);
    put_pixels_clamped_axp_p(block, dest, line_size);
}
299

    
300
/**
 * Inverse-transform a block in place, then hand the result to
 * add_pixels_clamped_axp_p() together with the destination.
 *
 * @param dest      destination passed on to add_pixels_clamped_axp_p()
 * @param line_size passed on unchanged -- presumably the byte stride
 *                  between destination rows, confirm in dsputil_alpha.h
 * @param block     8x8 coefficient block; overwritten by the IDCT
 */
void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_simple_idct_axp(block);
    add_pixels_clamped_axp_p(block, dest, line_size);
}