ffmpeg / libavcodec / alpha / simple_idct_alpha.c @ ad1862d6
History | View | Annotate | Download (7.31 KB)
1 |
/*
 * Simple IDCT (Alpha optimized)
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * based upon some outcommented C code from mpeg2dec (idct_mmx.c
 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
 *
 * Alpha optimizations by Måns Rullgård <mans@mansr.com>
 *                     and Falk Hueffner <falk@debian.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
28 |
|
29 |
#include "libavcodec/dsputil.h" |
30 |
#include "dsputil_alpha.h" |
31 |
#include "asm.h" |
32 |
|
33 |
/*
 * Fixed-point IDCT coefficients:
 *     Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
 * W4 is actually exactly 16384, but 16383 is used because it works around
 * accumulating rounding errors for some encoders.
 */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520

/* Right shifts applied to the accumulated sums of the row and column passes. */
#define ROW_SHIFT 11
#define COL_SHIFT 20
45 |
|
46 |
/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */
|
47 |
static inline int idct_row(DCTELEM *row) |
48 |
{ |
49 |
int a0, a1, a2, a3, b0, b1, b2, b3, t;
|
50 |
uint64_t l, r, t2; |
51 |
l = ldq(row); |
52 |
r = ldq(row + 4);
|
53 |
|
54 |
if (l == 0 && r == 0) |
55 |
return 0; |
56 |
|
57 |
a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1)); |
58 |
|
59 |
if (((l & ~0xffffUL) | r) == 0) { |
60 |
a0 >>= ROW_SHIFT; |
61 |
t2 = (uint16_t) a0; |
62 |
t2 |= t2 << 16;
|
63 |
t2 |= t2 << 32;
|
64 |
|
65 |
stq(t2, row); |
66 |
stq(t2, row + 4);
|
67 |
return 1; |
68 |
} |
69 |
|
70 |
a1 = a0; |
71 |
a2 = a0; |
72 |
a3 = a0; |
73 |
|
74 |
t = extwl(l, 4); /* row[2] */ |
75 |
if (t != 0) { |
76 |
t = sextw(t); |
77 |
a0 += W2 * t; |
78 |
a1 += W6 * t; |
79 |
a2 -= W6 * t; |
80 |
a3 -= W2 * t; |
81 |
} |
82 |
|
83 |
t = extwl(r, 0); /* row[4] */ |
84 |
if (t != 0) { |
85 |
t = sextw(t); |
86 |
a0 += W4 * t; |
87 |
a1 -= W4 * t; |
88 |
a2 -= W4 * t; |
89 |
a3 += W4 * t; |
90 |
} |
91 |
|
92 |
t = extwl(r, 4); /* row[6] */ |
93 |
if (t != 0) { |
94 |
t = sextw(t); |
95 |
a0 += W6 * t; |
96 |
a1 -= W2 * t; |
97 |
a2 += W2 * t; |
98 |
a3 -= W6 * t; |
99 |
} |
100 |
|
101 |
t = extwl(l, 2); /* row[1] */ |
102 |
if (t != 0) { |
103 |
t = sextw(t); |
104 |
b0 = W1 * t; |
105 |
b1 = W3 * t; |
106 |
b2 = W5 * t; |
107 |
b3 = W7 * t; |
108 |
} else {
|
109 |
b0 = 0;
|
110 |
b1 = 0;
|
111 |
b2 = 0;
|
112 |
b3 = 0;
|
113 |
} |
114 |
|
115 |
t = extwl(l, 6); /* row[3] */ |
116 |
if (t) {
|
117 |
t = sextw(t); |
118 |
b0 += W3 * t; |
119 |
b1 -= W7 * t; |
120 |
b2 -= W1 * t; |
121 |
b3 -= W5 * t; |
122 |
} |
123 |
|
124 |
|
125 |
t = extwl(r, 2); /* row[5] */ |
126 |
if (t) {
|
127 |
t = sextw(t); |
128 |
b0 += W5 * t; |
129 |
b1 -= W1 * t; |
130 |
b2 += W7 * t; |
131 |
b3 += W3 * t; |
132 |
} |
133 |
|
134 |
t = extwl(r, 6); /* row[7] */ |
135 |
if (t) {
|
136 |
t = sextw(t); |
137 |
b0 += W7 * t; |
138 |
b1 -= W5 * t; |
139 |
b2 += W3 * t; |
140 |
b3 -= W1 * t; |
141 |
} |
142 |
|
143 |
row[0] = (a0 + b0) >> ROW_SHIFT;
|
144 |
row[1] = (a1 + b1) >> ROW_SHIFT;
|
145 |
row[2] = (a2 + b2) >> ROW_SHIFT;
|
146 |
row[3] = (a3 + b3) >> ROW_SHIFT;
|
147 |
row[4] = (a3 - b3) >> ROW_SHIFT;
|
148 |
row[5] = (a2 - b2) >> ROW_SHIFT;
|
149 |
row[6] = (a1 - b1) >> ROW_SHIFT;
|
150 |
row[7] = (a0 - b0) >> ROW_SHIFT;
|
151 |
|
152 |
return 2; |
153 |
} |
154 |
|
155 |
/**
 * Apply the column pass of the IDCT in place to one column.
 *
 * The eight inputs are read from col[8 * k] for k = 0..7 and the eight
 * results are written back to the same positions.
 *
 * @param col  pointer to the first element of the column
 */
static inline void idct_col(DCTELEM *col)
{
    int even0, even1, even2, even3;
    int odd0 = 0, odd1 = 0, odd2 = 0, odd3 = 0;
    DCTELEM c;

    /* Fold the rounding term for the final shift into the DC input. */
    col[0] += (1 << (COL_SHIFT - 1)) / W4;

    even0 = even1 = even2 = even3 = W4 * col[8 * 0];

    /* Even-indexed inputs. */
    c = col[8 * 2];
    if (c) {
        even0 += W2 * c;
        even1 += W6 * c;
        even2 -= W6 * c;
        even3 -= W2 * c;
    }

    c = col[8 * 4];
    if (c) {
        even0 += W4 * c;
        even1 -= W4 * c;
        even2 -= W4 * c;
        even3 += W4 * c;
    }

    c = col[8 * 6];
    if (c) {
        even0 += W6 * c;
        even1 -= W2 * c;
        even2 += W2 * c;
        even3 -= W6 * c;
    }

    /* Odd-indexed inputs; the accumulators start at zero. */
    c = col[8 * 1];
    if (c) {
        odd0 += W1 * c;
        odd1 += W3 * c;
        odd2 += W5 * c;
        odd3 += W7 * c;
    }

    c = col[8 * 3];
    if (c) {
        odd0 += W3 * c;
        odd1 -= W7 * c;
        odd2 -= W1 * c;
        odd3 -= W5 * c;
    }

    c = col[8 * 5];
    if (c) {
        odd0 += W5 * c;
        odd1 -= W1 * c;
        odd2 += W7 * c;
        odd3 += W3 * c;
    }

    c = col[8 * 7];
    if (c) {
        odd0 += W7 * c;
        odd1 -= W5 * c;
        odd2 += W3 * c;
        odd3 -= W1 * c;
    }

    /* Butterfly: outputs k and 7 - k share the same even/odd pair. */
    col[8 * 0] = (even0 + odd0) >> COL_SHIFT;
    col[8 * 7] = (even0 - odd0) >> COL_SHIFT;
    col[8 * 1] = (even1 + odd1) >> COL_SHIFT;
    col[8 * 6] = (even1 - odd1) >> COL_SHIFT;
    col[8 * 2] = (even2 + odd2) >> COL_SHIFT;
    col[8 * 5] = (even2 - odd2) >> COL_SHIFT;
    col[8 * 3] = (even3 + odd3) >> COL_SHIFT;
    col[8 * 4] = (even3 - odd3) >> COL_SHIFT;
}
229 |
|
230 |
/* If all rows but the first one are zero after row transformation,
|
231 |
all rows will be identical after column transformation. */
|
232 |
static inline void idct_col2(DCTELEM *col) |
233 |
{ |
234 |
int i;
|
235 |
uint64_t l, r; |
236 |
|
237 |
for (i = 0; i < 8; ++i) { |
238 |
int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4; |
239 |
|
240 |
a0 *= W4; |
241 |
col[i] = a0 >> COL_SHIFT; |
242 |
} |
243 |
|
244 |
l = ldq(col + 0 * 4); r = ldq(col + 1 * 4); |
245 |
stq(l, col + 2 * 4); stq(r, col + 3 * 4); |
246 |
stq(l, col + 4 * 4); stq(r, col + 5 * 4); |
247 |
stq(l, col + 6 * 4); stq(r, col + 7 * 4); |
248 |
stq(l, col + 8 * 4); stq(r, col + 9 * 4); |
249 |
stq(l, col + 10 * 4); stq(r, col + 11 * 4); |
250 |
stq(l, col + 12 * 4); stq(r, col + 13 * 4); |
251 |
stq(l, col + 14 * 4); stq(r, col + 15 * 4); |
252 |
} |
253 |
|
254 |
/**
 * In-place IDCT of an 8x8 block: a row pass over all eight rows followed by
 * a column pass.
 *
 * The values returned by idct_row() are used to choose a cheaper column
 * pass when the block is sparse.
 *
 * @param block  the 64 coefficients, stored row by row; overwritten with the
 *               result
 */
void ff_simple_idct_axp(DCTELEM *block)
{
    int only_first_row = 1;     /* all rows except row 0 are zero */
    int rows_flat      = 1;     /* every row holds a single constant value */
    int i;

    for (i = 0; i < 8; i++) {
        int kind = idct_row(block + 8 * i);

        if (kind == 2)
            rows_flat = 0;
        if (kind > 0 && i > 0)
            only_first_row = 0;
    }

    if (only_first_row) {
        idct_col2(block);
        return;
    }

    if (!rows_flat) {
        /* General case: transform every column. */
        for (i = 0; i < 8; i++)
            idct_col(block + i);
        return;
    }

    /* Every row is constant, so all columns are equal: transform the first
       column only and replicate each result across its row, two rows per
       iteration. */
    idct_col(block);
    for (i = 0; i < 4; i++) {
        DCTELEM *rows   = block + 16 * i;
        uint64_t first  = (uint16_t) rows[0];
        uint64_t second = (uint16_t) rows[8];

        first  |= first  << 16;
        second |= second << 16;
        first  |= first  << 32;
        second |= second << 32;

        stq(first,  rows);
        stq(first,  rows + 4);
        stq(second, rows + 8);
        stq(second, rows + 12);
    }
}
293 |
|
294 |
/**
 * Run ff_simple_idct_axp() on block, then pass the transformed block to
 * put_pixels_clamped_axp_p() together with dest and line_size.
 *
 * @param dest       forwarded unchanged to put_pixels_clamped_axp_p()
 * @param line_size  forwarded unchanged to put_pixels_clamped_axp_p()
 * @param block      coefficients; transformed in place before being forwarded
 */
void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Transform first: the pixel routine consumes the IDCT output. */
    ff_simple_idct_axp(block);

    put_pixels_clamped_axp_p(block, dest, line_size);
}
299 |
|
300 |
/**
 * Run ff_simple_idct_axp() on block, then pass the transformed block to
 * add_pixels_clamped_axp_p() together with dest and line_size.
 *
 * @param dest       forwarded unchanged to add_pixels_clamped_axp_p()
 * @param line_size  forwarded unchanged to add_pixels_clamped_axp_p()
 * @param block      coefficients; transformed in place before being forwarded
 */
void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Transform first: the pixel routine consumes the IDCT output. */
    ff_simple_idct_axp(block);

    add_pixels_clamped_axp_p(block, dest, line_size);
}