ffmpeg / libavcodec / ppc / mpegvideo_altivec.c @ 5509bffa
History | View | Annotate | Download (24.7 KB)
1 | 05c4072b | Michael Niedermayer | /*
|
---|---|---|---|
2 | * Copyright (c) 2002 Dieter Shirley
|
||
3 | *
|
||
4 | a1947624 | Romain Dolbeau | * dct_unquantize_h263_altivec:
|
5 | * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
|
||
6 | *
|
||
7 | 05c4072b | Michael Niedermayer | * This library is free software; you can redistribute it and/or
|
8 | * modify it under the terms of the GNU Lesser General Public
|
||
9 | * License as published by the Free Software Foundation; either
|
||
10 | * version 2 of the License, or (at your option) any later version.
|
||
11 | *
|
||
12 | * This library is distributed in the hope that it will be useful,
|
||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
15 | * Lesser General Public License for more details.
|
||
16 | *
|
||
17 | * You should have received a copy of the GNU Lesser General Public
|
||
18 | * License along with this library; if not, write to the Free Software
|
||
19 | 5509bffa | Diego Biurrun | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
20 | 05c4072b | Michael Niedermayer | */
|
21 | a9a07762 | Michael Niedermayer | |
22 | 05c4072b | Michael Niedermayer | #include <stdlib.h> |
23 | #include <stdio.h> |
||
24 | #include "../dsputil.h" |
||
25 | #include "../mpegvideo.h" |
||
26 | a9a07762 | Michael Niedermayer | |
27 | #include "gcc_fixes.h" |
||
28 | 115329f1 | Diego Biurrun | |
29 | db40a39a | Michael Niedermayer | #include "dsputil_altivec.h" |
30 | 05c4072b | Michael Niedermayer | |
// Exchanges the contents of two lvalues of the same type.
// Used below to swap AltiVec vector variables.
#define SWAP(a,b)                       \
do {                                    \
    __typeof__(a) swap_hold = (a);      \
    (a) = (b);                          \
    (b) = swap_hold;                    \
} while (0)
38 | |||
// Transposes, in place, a 4x4 matrix held as four vectors of four
// elements each (a, b, c, d are the rows on entry and on exit).
#define TRANSPOSE4(a,b,c,d)                                     \
do {                                                            \
    __typeof__(a) t4_ac_hi, t4_ac_lo, t4_bd_hi, t4_bd_lo;       \
                                                                \
    /* interleave rows two apart */                             \
    t4_ac_hi = vec_mergeh((a), (c));                            \
    t4_ac_lo = vec_mergel((a), (c));                            \
    t4_bd_hi = vec_mergeh((b), (d));                            \
    t4_bd_lo = vec_mergel((b), (d));                            \
                                                                \
    /* interleave the interleaved pairs */                      \
    (a) = vec_mergeh(t4_ac_hi, t4_bd_hi);                       \
    (b) = vec_mergel(t4_ac_hi, t4_bd_hi);                       \
    (c) = vec_mergeh(t4_ac_lo, t4_bd_lo);                       \
    (d) = vec_mergel(t4_ac_lo, t4_bd_lo);                       \
} while (0)
52 | |||
// Transposes, in place, an 8x8 matrix held as eight vectors of eight
// elements each (a..h are the rows on entry and on exit), using three
// rounds of high/low merges.
#define TRANSPOSE8(a,b,c,d,e,f,g,h)                                     \
do {                                                                    \
    /* round 1: interleave rows four apart */                           \
    __typeof__(a) t8_ae_hi = vec_mergeh((a), (e));                      \
    __typeof__(a) t8_ae_lo = vec_mergel((a), (e));                      \
    __typeof__(a) t8_bf_hi = vec_mergeh((b), (f));                      \
    __typeof__(a) t8_bf_lo = vec_mergel((b), (f));                      \
    __typeof__(a) t8_cg_hi = vec_mergeh((c), (g));                      \
    __typeof__(a) t8_cg_lo = vec_mergel((c), (g));                      \
    __typeof__(a) t8_dh_hi = vec_mergeh((d), (h));                      \
    __typeof__(a) t8_dh_lo = vec_mergel((d), (h));                      \
                                                                        \
    /* round 2: combine a/e with c/g and b/f with d/h */                \
    __typeof__(a) t8_aceg_0 = vec_mergeh(t8_ae_hi, t8_cg_hi);           \
    __typeof__(a) t8_aceg_1 = vec_mergel(t8_ae_hi, t8_cg_hi);           \
    __typeof__(a) t8_aceg_2 = vec_mergeh(t8_ae_lo, t8_cg_lo);           \
    __typeof__(a) t8_aceg_3 = vec_mergel(t8_ae_lo, t8_cg_lo);           \
    __typeof__(a) t8_bdfh_0 = vec_mergeh(t8_bf_hi, t8_dh_hi);           \
    __typeof__(a) t8_bdfh_1 = vec_mergel(t8_bf_hi, t8_dh_hi);           \
    __typeof__(a) t8_bdfh_2 = vec_mergeh(t8_bf_lo, t8_dh_lo);           \
    __typeof__(a) t8_bdfh_3 = vec_mergel(t8_bf_lo, t8_dh_lo);           \
                                                                        \
    /* round 3: final interleave yields the transposed rows */          \
    (a) = vec_mergeh(t8_aceg_0, t8_bdfh_0);                             \
    (b) = vec_mergel(t8_aceg_0, t8_bdfh_0);                             \
    (c) = vec_mergeh(t8_aceg_1, t8_bdfh_1);                             \
    (d) = vec_mergel(t8_aceg_1, t8_bdfh_1);                             \
    (e) = vec_mergeh(t8_aceg_2, t8_bdfh_2);                             \
    (f) = vec_mergel(t8_aceg_2, t8_bdfh_2);                             \
    (g) = vec_mergeh(t8_aceg_3, t8_bdfh_3);                             \
    (h) = vec_mergel(t8_aceg_3, t8_bdfh_3);                             \
} while (0)
85 | |||
86 | |||
// Loads a four-byte value (int or float) from the target address
// into every element in the target vector.  Only works if the
// target address is four-byte aligned (which should be always).
//
// vec     - vector lvalue that receives the replicated value
// address - pointer to the four-byte value to load
//
// The body is wrapped in do { } while (0), like the other multi-statement
// macros in this file, so that "LOAD4(v, p);" behaves as a single statement
// (e.g. directly under an if that has an else branch).
#define LOAD4(vec, address)                                             \
do {                                                                    \
    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address);          \
    /* permute mask derived from the address by vec_lvsl */             \
    vector unsigned char _perm_vec = vec_lvsl(0,(address));             \
    /* load the vector at the address */                                \
    vec = vec_ld(0, _load_addr);                                        \
    /* reorder with the mask so the wanted value is element 0 */        \
    vec = vec_perm(vec, vec, _perm_vec);                                \
    /* replicate element 0 into every element */                        \
    vec = vec_splat(vec, 0);                                            \
} while (0)
98 | |||
99 | 3b991c54 | Romain Dolbeau | |
// FOUROF(a) produces the initialiser used (after a vector cast) for a
// four-element vector whose elements are all `a`.
#if !defined(CONFIG_DARWIN)
// slower, for dumb non-apple GCC: spell out the value once per element
#define FOUROF(a) {a,a,a,a}
#else
// CONFIG_DARWIN build: a single parenthesised value
// (presumably replicated by Apple's GCC vector-literal syntax -- confirm)
#define FOUROF(a) (a)
#endif
106 | 115329f1 | Diego Biurrun | int dct_quantize_altivec(MpegEncContext* s,
|
107 | 05c4072b | Michael Niedermayer | DCTELEM* data, int n,
|
108 | int qscale, int* overflow) |
||
109 | { |
||
110 | int lastNonZero;
|
||
111 | vector float row0, row1, row2, row3, row4, row5, row6, row7;
|
||
112 | vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
|
||
113 | aab34ca0 | Michael Niedermayer | const_vector float zero = (const_vector float)FOUROF(0.); |
114 | // used after quantise step
|
||
115 | int oldBaseValue = 0; |
||
116 | 05c4072b | Michael Niedermayer | |
117 | // Load the data into the row/alt vectors
|
||
118 | { |
||
119 | vector signed short data0, data1, data2, data3, data4, data5, data6, data7; |
||
120 | |||
121 | data0 = vec_ld(0, data);
|
||
122 | data1 = vec_ld(16, data);
|
||
123 | data2 = vec_ld(32, data);
|
||
124 | data3 = vec_ld(48, data);
|
||
125 | data4 = vec_ld(64, data);
|
||
126 | data5 = vec_ld(80, data);
|
||
127 | data6 = vec_ld(96, data);
|
||
128 | data7 = vec_ld(112, data);
|
||
129 | |||
130 | // Transpose the data before we start
|
||
131 | TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); |
||
132 | |||
133 | // load the data into floating point vectors. We load
|
||
134 | // the high half of each row into the main row vectors
|
||
135 | // and the low half into the alt vectors.
|
||
136 | row0 = vec_ctf(vec_unpackh(data0), 0);
|
||
137 | alt0 = vec_ctf(vec_unpackl(data0), 0);
|
||
138 | row1 = vec_ctf(vec_unpackh(data1), 0);
|
||
139 | alt1 = vec_ctf(vec_unpackl(data1), 0);
|
||
140 | row2 = vec_ctf(vec_unpackh(data2), 0);
|
||
141 | alt2 = vec_ctf(vec_unpackl(data2), 0);
|
||
142 | row3 = vec_ctf(vec_unpackh(data3), 0);
|
||
143 | alt3 = vec_ctf(vec_unpackl(data3), 0);
|
||
144 | row4 = vec_ctf(vec_unpackh(data4), 0);
|
||
145 | alt4 = vec_ctf(vec_unpackl(data4), 0);
|
||
146 | row5 = vec_ctf(vec_unpackh(data5), 0);
|
||
147 | alt5 = vec_ctf(vec_unpackl(data5), 0);
|
||
148 | row6 = vec_ctf(vec_unpackh(data6), 0);
|
||
149 | alt6 = vec_ctf(vec_unpackl(data6), 0);
|
||
150 | row7 = vec_ctf(vec_unpackh(data7), 0);
|
||
151 | alt7 = vec_ctf(vec_unpackl(data7), 0);
|
||
152 | } |
||
153 | |||
154 | // The following block could exist as a separate an altivec dct
|
||
155 | bb270c08 | Diego Biurrun | // function. However, if we put it inline, the DCT data can remain
|
156 | // in the vector local variables, as floats, which we'll use during the
|
||
157 | // quantize step...
|
||
158 | 05c4072b | Michael Niedermayer | { |
159 | 3b991c54 | Romain Dolbeau | const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); |
160 | const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); |
||
161 | const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); |
||
162 | const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); |
||
163 | const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); |
||
164 | const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); |
||
165 | const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); |
||
166 | const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); |
||
167 | const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); |
||
168 | const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); |
||
169 | const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); |
||
170 | const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); |
||
171 | 05c4072b | Michael Niedermayer | |
172 | |||
173 | int whichPass, whichHalf;
|
||
174 | |||
175 | for(whichPass = 1; whichPass<=2; whichPass++) |
||
176 | { |
||
177 | for(whichHalf = 1; whichHalf<=2; whichHalf++) |
||
178 | { |
||
179 | vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||
180 | vector float tmp10, tmp11, tmp12, tmp13;
|
||
181 | vector float z1, z2, z3, z4, z5;
|
||
182 | |||
183 | tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
|
||
184 | tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
|
||
185 | tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
|
||
186 | tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
|
||
187 | tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
|
||
188 | tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
|
||
189 | tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
|
||
190 | tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];
|
||
191 | |||
192 | tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
|
||
193 | tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
|
||
194 | tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
|
||
195 | tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;
|
||
196 | |||
197 | |||
198 | // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
|
||
199 | row0 = vec_add(tmp10, tmp11); |
||
200 | |||
201 | // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
|
||
202 | row4 = vec_sub(tmp10, tmp11); |
||
203 | |||
204 | |||
205 | // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
|
||
206 | z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
|
||
207 | |||
208 | // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
|
||
209 | bb270c08 | Diego Biurrun | // CONST_BITS-PASS1_BITS);
|
210 | 05c4072b | Michael Niedermayer | row2 = vec_madd(tmp13, vec_0_765366865, z1); |
211 | |||
212 | // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
|
||
213 | bb270c08 | Diego Biurrun | // CONST_BITS-PASS1_BITS);
|
214 | 05c4072b | Michael Niedermayer | row6 = vec_madd(tmp12, vec_1_847759065, z1); |
215 | |||
216 | z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
|
||
217 | z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
|
||
218 | z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
|
||
219 | z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;
|
||
220 | |||
221 | // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
|
||
222 | z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);
|
||
223 | |||
224 | // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
|
||
225 | z3 = vec_madd(z3, vec_1_961570560, z5); |
||
226 | |||
227 | // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
|
||
228 | z4 = vec_madd(z4, vec_0_390180644, z5); |
||
229 | |||
230 | // The following adds are rolled into the multiplies above
|
||
231 | // z3 = vec_add(z3, z5); // z3 += z5;
|
||
232 | // z4 = vec_add(z4, z5); // z4 += z5;
|
||
233 | |||
234 | // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
|
||
235 | // Wow! It's actually more effecient to roll this multiply
|
||
236 | // into the adds below, even thought the multiply gets done twice!
|
||
237 | // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
|
||
238 | |||
239 | // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
|
||
240 | // Same with this one...
|
||
241 | // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);
|
||
242 | |||
243 | // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
|
||
244 | // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
|
||
245 | row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); |
||
246 | |||
247 | // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
|
||
248 | // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
|
||
249 | row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); |
||
250 | |||
251 | // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
|
||
252 | // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
|
||
253 | row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); |
||
254 | |||
255 | // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
|
||
256 | // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
|
||
257 | row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); |
||
258 | |||
259 | // Swap the row values with the alts. If this is the first half,
|
||
260 | // this sets up the low values to be acted on in the second half.
|
||
261 | // If this is the second half, it puts the high values back in
|
||
262 | // the row values where they are expected to be when we're done.
|
||
263 | SWAP(row0, alt0); |
||
264 | SWAP(row1, alt1); |
||
265 | SWAP(row2, alt2); |
||
266 | SWAP(row3, alt3); |
||
267 | SWAP(row4, alt4); |
||
268 | SWAP(row5, alt5); |
||
269 | SWAP(row6, alt6); |
||
270 | SWAP(row7, alt7); |
||
271 | } |
||
272 | |||
273 | if (whichPass == 1) |
||
274 | { |
||
275 | // transpose the data for the second pass
|
||
276 | 115329f1 | Diego Biurrun | |
277 | 05c4072b | Michael Niedermayer | // First, block transpose the upper right with lower left.
|
278 | SWAP(row4, alt0); |
||
279 | SWAP(row5, alt1); |
||
280 | SWAP(row6, alt2); |
||
281 | SWAP(row7, alt3); |
||
282 | |||
283 | // Now, transpose each block of four
|
||
284 | TRANSPOSE4(row0, row1, row2, row3); |
||
285 | TRANSPOSE4(row4, row5, row6, row7); |
||
286 | TRANSPOSE4(alt0, alt1, alt2, alt3); |
||
287 | TRANSPOSE4(alt4, alt5, alt6, alt7); |
||
288 | } |
||
289 | } |
||
290 | } |
||
291 | |||
292 | // perform the quantise step, using the floating point data
|
||
293 | // still in the row/alt registers
|
||
294 | { |
||
295 | const int* biasAddr; |
||
296 | const vector signed int* qmat; |
||
297 | vector float bias, negBias;
|
||
298 | |||
299 | if (s->mb_intra)
|
||
300 | { |
||
301 | vector signed int baseVector; |
||
302 | |||
303 | // We must cache element 0 in the intra case
|
||
304 | // (it needs special handling).
|
||
305 | baseVector = vec_cts(vec_splat(row0, 0), 0); |
||
306 | vec_ste(baseVector, 0, &oldBaseValue);
|
||
307 | |||
308 | qmat = (vector signed int*)s->q_intra_matrix[qscale]; |
||
309 | biasAddr = &(s->intra_quant_bias); |
||
310 | } |
||
311 | else
|
||
312 | { |
||
313 | qmat = (vector signed int*)s->q_inter_matrix[qscale]; |
||
314 | biasAddr = &(s->inter_quant_bias); |
||
315 | } |
||
316 | |||
317 | // Load the bias vector (We add 0.5 to the bias so that we're
|
||
318 | bb270c08 | Diego Biurrun | // rounding when we convert to int, instead of flooring.)
|
319 | 05c4072b | Michael Niedermayer | { |
320 | vector signed int biasInt; |
||
321 | 3b991c54 | Romain Dolbeau | const vector float negOneFloat = (vector float)FOUROF(-1.0f); |
322 | 05c4072b | Michael Niedermayer | LOAD4(biasInt, biasAddr); |
323 | bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); |
||
324 | negBias = vec_madd(bias, negOneFloat, zero); |
||
325 | } |
||
326 | |||
327 | { |
||
328 | vector float q0, q1, q2, q3, q4, q5, q6, q7;
|
||
329 | |||
330 | q0 = vec_ctf(qmat[0], QMAT_SHIFT);
|
||
331 | q1 = vec_ctf(qmat[2], QMAT_SHIFT);
|
||
332 | q2 = vec_ctf(qmat[4], QMAT_SHIFT);
|
||
333 | q3 = vec_ctf(qmat[6], QMAT_SHIFT);
|
||
334 | q4 = vec_ctf(qmat[8], QMAT_SHIFT);
|
||
335 | q5 = vec_ctf(qmat[10], QMAT_SHIFT);
|
||
336 | q6 = vec_ctf(qmat[12], QMAT_SHIFT);
|
||
337 | q7 = vec_ctf(qmat[14], QMAT_SHIFT);
|
||
338 | |||
339 | row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), |
||
340 | vec_cmpgt(row0, zero)); |
||
341 | row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), |
||
342 | vec_cmpgt(row1, zero)); |
||
343 | row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), |
||
344 | vec_cmpgt(row2, zero)); |
||
345 | row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), |
||
346 | vec_cmpgt(row3, zero)); |
||
347 | row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), |
||
348 | vec_cmpgt(row4, zero)); |
||
349 | row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), |
||
350 | vec_cmpgt(row5, zero)); |
||
351 | row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), |
||
352 | vec_cmpgt(row6, zero)); |
||
353 | row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), |
||
354 | vec_cmpgt(row7, zero)); |
||
355 | |||
356 | q0 = vec_ctf(qmat[1], QMAT_SHIFT);
|
||
357 | q1 = vec_ctf(qmat[3], QMAT_SHIFT);
|
||
358 | q2 = vec_ctf(qmat[5], QMAT_SHIFT);
|
||
359 | q3 = vec_ctf(qmat[7], QMAT_SHIFT);
|
||
360 | q4 = vec_ctf(qmat[9], QMAT_SHIFT);
|
||
361 | q5 = vec_ctf(qmat[11], QMAT_SHIFT);
|
||
362 | q6 = vec_ctf(qmat[13], QMAT_SHIFT);
|
||
363 | q7 = vec_ctf(qmat[15], QMAT_SHIFT);
|
||
364 | |||
365 | alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), |
||
366 | vec_cmpgt(alt0, zero)); |
||
367 | alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), |
||
368 | vec_cmpgt(alt1, zero)); |
||
369 | alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), |
||
370 | vec_cmpgt(alt2, zero)); |
||
371 | alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), |
||
372 | vec_cmpgt(alt3, zero)); |
||
373 | alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), |
||
374 | vec_cmpgt(alt4, zero)); |
||
375 | alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), |
||
376 | vec_cmpgt(alt5, zero)); |
||
377 | alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), |
||
378 | vec_cmpgt(alt6, zero)); |
||
379 | alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), |
||
380 | vec_cmpgt(alt7, zero)); |
||
381 | } |
||
382 | |||
383 | 115329f1 | Diego Biurrun | |
384 | 05c4072b | Michael Niedermayer | } |
385 | |||
386 | // Store the data back into the original block
|
||
387 | { |
||
388 | vector signed short data0, data1, data2, data3, data4, data5, data6, data7; |
||
389 | |||
390 | data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); |
||
391 | data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); |
||
392 | data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); |
||
393 | data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); |
||
394 | data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); |
||
395 | data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); |
||
396 | data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); |
||
397 | data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); |
||
398 | |||
399 | { |
||
400 | // Clamp for overflow
|
||
401 | vector signed int max_q_int, min_q_int; |
||
402 | vector signed short max_q, min_q; |
||
403 | |||
404 | LOAD4(max_q_int, &(s->max_qcoeff)); |
||
405 | LOAD4(min_q_int, &(s->min_qcoeff)); |
||
406 | |||
407 | max_q = vec_pack(max_q_int, max_q_int); |
||
408 | min_q = vec_pack(min_q_int, min_q_int); |
||
409 | |||
410 | data0 = vec_max(vec_min(data0, max_q), min_q); |
||
411 | data1 = vec_max(vec_min(data1, max_q), min_q); |
||
412 | data2 = vec_max(vec_min(data2, max_q), min_q); |
||
413 | data4 = vec_max(vec_min(data4, max_q), min_q); |
||
414 | data5 = vec_max(vec_min(data5, max_q), min_q); |
||
415 | data6 = vec_max(vec_min(data6, max_q), min_q); |
||
416 | data7 = vec_max(vec_min(data7, max_q), min_q); |
||
417 | } |
||
418 | |||
419 | aab34ca0 | Michael Niedermayer | { |
420 | 05c4072b | Michael Niedermayer | vector bool char zero_01, zero_23, zero_45, zero_67; |
421 | vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67; |
||
422 | vector signed char negOne = vec_splat_s8(-1); |
||
423 | vector signed char* scanPtr = |
||
424 | (vector signed char*)(s->intra_scantable.inverse); |
||
425 | aab34ca0 | Michael Niedermayer | signed char lastNonZeroChar; |
426 | 05c4072b | Michael Niedermayer | |
427 | // Determine the largest non-zero index.
|
||
428 | aab34ca0 | Michael Niedermayer | zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), |
429 | vec_cmpeq(data1, (vector signed short)zero)); |
||
430 | zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), |
||
431 | vec_cmpeq(data3, (vector signed short)zero)); |
||
432 | zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), |
||
433 | vec_cmpeq(data5, (vector signed short)zero)); |
||
434 | zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), |
||
435 | vec_cmpeq(data7, (vector signed short)zero)); |
||
436 | 05c4072b | Michael Niedermayer | |
437 | // 64 biggest values
|
||
438 | scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
|
||
439 | scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23);
|
||
440 | scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45);
|
||
441 | scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67);
|
||
442 | |||
443 | // 32 largest values
|
||
444 | scanIndices_01 = vec_max(scanIndices_01, scanIndices_23); |
||
445 | scanIndices_45 = vec_max(scanIndices_45, scanIndices_67); |
||
446 | |||
447 | // 16 largest values
|
||
448 | scanIndices_01 = vec_max(scanIndices_01, scanIndices_45); |
||
449 | |||
450 | // 8 largest values
|
||
451 | scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), |
||
452 | vec_mergel(scanIndices_01, negOne)); |
||
453 | |||
454 | // 4 largest values
|
||
455 | scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), |
||
456 | vec_mergel(scanIndices_01, negOne)); |
||
457 | |||
458 | // 2 largest values
|
||
459 | scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), |
||
460 | vec_mergel(scanIndices_01, negOne)); |
||
461 | |||
462 | // largest value
|
||
463 | scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), |
||
464 | vec_mergel(scanIndices_01, negOne)); |
||
465 | |||
466 | scanIndices_01 = vec_splat(scanIndices_01, 0);
|
||
467 | |||
468 | |||
469 | vec_ste(scanIndices_01, 0, &lastNonZeroChar);
|
||
470 | |||
471 | lastNonZero = lastNonZeroChar; |
||
472 | 115329f1 | Diego Biurrun | |
473 | 05c4072b | Michael Niedermayer | // While the data is still in vectors we check for the transpose IDCT permute
|
474 | // and handle it using the vector unit if we can. This is the permute used
|
||
475 | // by the altivec idct, so it is common when using the altivec dct.
|
||
476 | |||
477 | b0368839 | Michael Niedermayer | if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) |
478 | 05c4072b | Michael Niedermayer | { |
479 | TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); |
||
480 | } |
||
481 | |||
482 | vec_st(data0, 0, data);
|
||
483 | vec_st(data1, 16, data);
|
||
484 | vec_st(data2, 32, data);
|
||
485 | vec_st(data3, 48, data);
|
||
486 | vec_st(data4, 64, data);
|
||
487 | vec_st(data5, 80, data);
|
||
488 | vec_st(data6, 96, data);
|
||
489 | vec_st(data7, 112, data);
|
||
490 | aab34ca0 | Michael Niedermayer | } |
491 | 05c4072b | Michael Niedermayer | } |
492 | |||
493 | // special handling of block[0]
|
||
494 | if (s->mb_intra)
|
||
495 | { |
||
496 | if (!s->h263_aic)
|
||
497 | { |
||
498 | if (n < 4) |
||
499 | oldBaseValue /= s->y_dc_scale; |
||
500 | else
|
||
501 | oldBaseValue /= s->c_dc_scale; |
||
502 | } |
||
503 | |||
504 | // Divide by 8, rounding the result
|
||
505 | data[0] = (oldBaseValue + 4) >> 3; |
||
506 | } |
||
507 | |||
508 | // We handled the tranpose permutation above and we don't
|
||
509 | // need to permute the "no" permutation case.
|
||
510 | if ((lastNonZero > 0) && |
||
511 | b0368839 | Michael Niedermayer | (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && |
512 | (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) |
||
513 | 05c4072b | Michael Niedermayer | { |
514 | 10564521 | Michael Niedermayer | ff_block_permute(data, s->dsp.idct_permutation, |
515 | 05c4072b | Michael Niedermayer | s->intra_scantable.scantable, lastNonZero); |
516 | } |
||
517 | |||
518 | return lastNonZero;
|
||
519 | } |
||
520 | 3b991c54 | Romain Dolbeau | #undef FOUROF
|
521 | 05c4072b | Michael Niedermayer | |
522 | 744ac4be | Michael Niedermayer | /*
|
523 | AltiVec version of dct_unquantize_h263
|
||
524 | this code assumes `block' is 16 bytes-aligned
|
||
525 | */
|
||
526 | 115329f1 | Diego Biurrun | void dct_unquantize_h263_altivec(MpegEncContext *s,
|
527 | 744ac4be | Michael Niedermayer | DCTELEM *block, int n, int qscale) |
528 | { |
||
529 | e45a2872 | Romain Dolbeau | POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
|
530 | 744ac4be | Michael Niedermayer | int i, level, qmul, qadd;
|
531 | int nCoeffs;
|
||
532 | 115329f1 | Diego Biurrun | |
533 | 744ac4be | Michael Niedermayer | assert(s->block_last_index[n]>=0);
|
534 | db40a39a | Michael Niedermayer | |
535 | e45a2872 | Romain Dolbeau | POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
|
536 | 115329f1 | Diego Biurrun | |
537 | 744ac4be | Michael Niedermayer | qadd = (qscale - 1) | 1; |
538 | qmul = qscale << 1;
|
||
539 | 115329f1 | Diego Biurrun | |
540 | 744ac4be | Michael Niedermayer | if (s->mb_intra) {
|
541 | if (!s->h263_aic) {
|
||
542 | 115329f1 | Diego Biurrun | if (n < 4) |
543 | 744ac4be | Michael Niedermayer | block[0] = block[0] * s->y_dc_scale; |
544 | else
|
||
545 | block[0] = block[0] * s->c_dc_scale; |
||
546 | }else
|
||
547 | qadd = 0;
|
||
548 | i = 1;
|
||
549 | 115329f1 | Diego Biurrun | nCoeffs= 63; //does not allways use zigzag table |
550 | 744ac4be | Michael Niedermayer | } else {
|
551 | i = 0;
|
||
552 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; |
||
553 | } |
||
554 | |||
555 | db40a39a | Michael Niedermayer | #ifdef ALTIVEC_USE_REFERENCE_C_CODE
|
556 | 744ac4be | Michael Niedermayer | for(;i<=nCoeffs;i++) {
|
557 | level = block[i]; |
||
558 | if (level) {
|
||
559 | if (level < 0) { |
||
560 | level = level * qmul - qadd; |
||
561 | } else {
|
||
562 | level = level * qmul + qadd; |
||
563 | } |
||
564 | block[i] = level; |
||
565 | } |
||
566 | } |
||
567 | db40a39a | Michael Niedermayer | #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
568 | 744ac4be | Michael Niedermayer | { |
569 | aab34ca0 | Michael Niedermayer | register const_vector signed short vczero = (const_vector signed short)vec_splat_s16(0); |
570 | 744ac4be | Michael Niedermayer | short __attribute__ ((aligned(16))) qmul8[] = |
571 | { |
||
572 | qmul, qmul, qmul, qmul, |
||
573 | qmul, qmul, qmul, qmul |
||
574 | }; |
||
575 | short __attribute__ ((aligned(16))) qadd8[] = |
||
576 | { |
||
577 | qadd, qadd, qadd, qadd, |
||
578 | qadd, qadd, qadd, qadd |
||
579 | }; |
||
580 | short __attribute__ ((aligned(16))) nqadd8[] = |
||
581 | { |
||
582 | -qadd, -qadd, -qadd, -qadd, |
||
583 | -qadd, -qadd, -qadd, -qadd |
||
584 | }; |
||
585 | aab34ca0 | Michael Niedermayer | register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; |
586 | 744ac4be | Michael Niedermayer | register vector bool short blockv_null, blockv_neg; |
587 | register short backup_0 = block[0]; |
||
588 | register int j = 0; |
||
589 | 115329f1 | Diego Biurrun | |
590 | 744ac4be | Michael Niedermayer | qmulv = vec_ld(0, qmul8);
|
591 | qaddv = vec_ld(0, qadd8);
|
||
592 | nqaddv = vec_ld(0, nqadd8);
|
||
593 | |||
594 | db40a39a | Michael Niedermayer | #if 0 // block *is* 16 bytes-aligned, it seems.
|
595 | 744ac4be | Michael Niedermayer | // first make sure block[j] is 16 bytes-aligned
|
596 | for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
|
||
597 | level = block[j];
|
||
598 | if (level) {
|
||
599 | if (level < 0) {
|
||
600 | level = level * qmul - qadd;
|
||
601 | } else {
|
||
602 | level = level * qmul + qadd;
|
||
603 | }
|
||
604 | block[j] = level;
|
||
605 | }
|
||
606 | }
|
||
607 | db40a39a | Michael Niedermayer | #endif
|
608 | 115329f1 | Diego Biurrun | |
609 | 744ac4be | Michael Niedermayer | // vectorize all the 16 bytes-aligned blocks
|
610 | // of 8 elements
|
||
611 | for(; (j + 7) <= nCoeffs ; j+=8) |
||
612 | { |
||
613 | blockv = vec_ld(j << 1, block);
|
||
614 | blockv_neg = vec_cmplt(blockv, vczero); |
||
615 | blockv_null = vec_cmpeq(blockv, vczero); |
||
616 | // choose between +qadd or -qadd as the third operand
|
||
617 | temp1 = vec_sel(qaddv, nqaddv, blockv_neg); |
||
618 | // multiply & add (block{i,i+7} * qmul [+-] qadd)
|
||
619 | temp1 = vec_mladd(blockv, qmulv, temp1); |
||
620 | // put 0 where block[{i,i+7} used to have 0
|
||
621 | blockv = vec_sel(temp1, blockv, blockv_null); |
||
622 | vec_st(blockv, j << 1, block);
|
||
623 | } |
||
624 | |||
625 | // if nCoeffs isn't a multiple of 8, finish the job
|
||
626 | // using good old scalar units.
|
||
627 | // (we could do it using a truncated vector,
|
||
628 | // but I'm not sure it's worth the hassle)
|
||
629 | for(; j <= nCoeffs ; j++) {
|
||
630 | level = block[j]; |
||
631 | if (level) {
|
||
632 | if (level < 0) { |
||
633 | level = level * qmul - qadd; |
||
634 | } else {
|
||
635 | level = level * qmul + qadd; |
||
636 | } |
||
637 | block[j] = level; |
||
638 | } |
||
639 | } |
||
640 | 115329f1 | Diego Biurrun | |
641 | 744ac4be | Michael Niedermayer | if (i == 1) |
642 | { // cheat. this avoid special-casing the first iteration
|
||
643 | block[0] = backup_0;
|
||
644 | } |
||
645 | } |
||
646 | db40a39a | Michael Niedermayer | #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
647 | |||
648 | e45a2872 | Romain Dolbeau | POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
|
649 | 744ac4be | Michael Niedermayer | } |