ffmpeg / libavcodec / ppc / idct_altivec.c @ b550bfaa
History | View | Annotate | Download (11.3 KB)
1 |
/*
|
---|---|
2 |
* Copyright (c) 2001 Michel Lespinasse
|
3 |
*
|
4 |
* This file is part of FFmpeg.
|
5 |
*
|
6 |
* FFmpeg is free software; you can redistribute it and/or
|
7 |
* modify it under the terms of the GNU Lesser General Public
|
8 |
* License as published by the Free Software Foundation; either
|
9 |
* version 2.1 of the License, or (at your option) any later version.
|
10 |
*
|
11 |
* FFmpeg is distributed in the hope that it will be useful,
|
12 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 |
* Lesser General Public License for more details.
|
15 |
*
|
16 |
* You should have received a copy of the GNU Lesser General Public
|
17 |
* License along with FFmpeg; if not, write to the Free Software
|
18 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
19 |
*
|
20 |
*/
|
21 |
|
22 |
/*
|
23 |
* NOTE: This code is based on GPL code from the libmpeg2 project. The
|
24 |
* author, Michel Lespinasse, has given explicit permission to release
|
25 |
* under LGPL as part of ffmpeg.
|
26 |
*
|
27 |
*/
|
28 |
|
29 |
/*
|
30 |
* FFmpeg integration by Dieter Shirley
|
31 |
*
|
32 |
* This file is a direct copy of the altivec idct module from the libmpeg2
|
33 |
* project. I've deleted all of the libmpeg2 specific code, renamed the functions and
|
34 |
* re-ordered the function parameters. The only change to the IDCT function
|
35 |
* itself was to factor out the partial transposition, and to perform a full
|
36 |
* transpose at the end of the function.
|
37 |
*/
|
38 |
|
39 |
|
40 |
#include <stdlib.h> /* malloc(), free() */ |
41 |
#include <string.h> |
42 |
#include "dsputil.h" |
43 |
|
44 |
#include "gcc_fixes.h" |
45 |
|
46 |
#include "dsputil_altivec.h" |
47 |
|
48 |
/*
 * Short names for the AltiVec vector types used in this file.
 * const_vector is not defined here; presumably it comes from gcc_fixes.h.
 */
#define vector_s16_t        vector signed short
#define const_vector_s16_t  const_vector signed short
#define vector_u16_t        vector unsigned short
#define vector_s8_t         vector signed char
#define vector_u8_t         vector unsigned char
#define vector_s32_t        vector signed int
#define vector_u32_t        vector unsigned int
55 |
|
56 |
/*
 * One pass of the 8-point inverse DCT butterfly network, evaluated on all
 * vector lanes at once.
 *
 * Inputs : vx0..vx7, plus the coefficients a0, a1, a2, ma2, c4, mc4 and zero.
 * Outputs: vy0..vy7.
 * Scratch: t0..t8.
 *
 * None of these names are declared here; the expanding scope must provide
 * them (the IDCT macro does).  Only saturating adds/subtracts are used, and
 * vec_mradds(a, b, c) is the rounded, saturating multiply-high of a and b
 * with c added.
 */
#define IDCT_HALF                                           \
    /* stage 1: odd-numbered inputs */                      \
    t1 = vec_mradds(a1, vx7, vx1);                          \
    t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7));          \
    t7 = vec_mradds(a2, vx5, vx3);                          \
    t3 = vec_mradds(ma2, vx3, vx5);                         \
                                                            \
    /* stage 2: even-numbered inputs, first odd butterflies */ \
    t5 = vec_adds(vx0, vx4);                                \
    t0 = vec_subs(vx0, vx4);                                \
    t2 = vec_mradds(a0, vx6, vx2);                          \
    t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6));          \
    t6 = vec_adds(t8, t3);                                  \
    t3 = vec_subs(t8, t3);                                  \
    t8 = vec_subs(t1, t7);                                  \
    t1 = vec_adds(t1, t7);                                  \
                                                            \
    /* stage 3 */                                           \
    t7 = vec_adds(t5, t2);                                  \
    t2 = vec_subs(t5, t2);                                  \
    t5 = vec_adds(t0, t4);                                  \
    t0 = vec_subs(t0, t4);                                  \
    t4 = vec_subs(t8, t3);                                  \
    t3 = vec_adds(t8, t3);                                  \
                                                            \
    /* stage 4: produce the eight outputs */                \
    vy0 = vec_adds(t7, t1);                                 \
    vy7 = vec_subs(t7, t1);                                 \
    vy1 = vec_mradds(c4, t3, t5);                           \
    vy6 = vec_mradds(mc4, t3, t5);                          \
    vy2 = vec_mradds(c4, t4, t0);                           \
    vy5 = vec_mradds(mc4, t4, t0);                          \
    vy3 = vec_adds(t2, t6);                                 \
    vy4 = vec_subs(t2, t6);
90 |
|
91 |
|
92 |
/*
 * Full 8x8 inverse DCT.
 *
 * Expects `block` (indexable as block[0]..block[7], each a vector of eight
 * 16-bit values) and the file-scope `constants` table to be visible where the
 * macro is expanded.  It declares all of its working variables itself, so it
 * can be expanded only once per scope.  On completion the result is left in
 * vx0..vx7; `zero` also stays in scope for the caller.
 *
 * Sequence: pre-scale the input, run IDCT_HALF, transpose, run IDCT_HALF
 * again, then arithmetic-shift every result right by 6.
 */
#define IDCT                                                                \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                    \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                    \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;                      \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;                        \
    vector_u16_t shift;                                                     \
                                                                            \
    /* broadcast each scalar coefficient stored in constants[0] */          \
    c4   = vec_splat(constants[0], 0);                                      \
    a0   = vec_splat(constants[0], 1);                                      \
    a1   = vec_splat(constants[0], 2);                                      \
    a2   = vec_splat(constants[0], 3);                                      \
    mc4  = vec_splat(constants[0], 4);                                      \
    ma2  = vec_splat(constants[0], 5);                                      \
    /* splat the last 32-bit word, i.e. the final pair of 16-bit entries */ \
    bias = (vector_s16_t)vec_splat((vector_s32_t)constants[0], 3);          \
                                                                            \
    zero  = vec_splat_s16(0);                                               \
    shift = vec_splat_u16(4);                                               \
                                                                            \
    /* pre-scale: shift each input left by 4, then multiply by its scale */ \
    /* vector from constants[1..4]                                       */ \
    vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero);          \
    vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero);          \
    vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero);          \
    vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero);          \
    vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero);          \
    vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero);          \
    vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero);          \
    vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero);          \
                                                                            \
    /* first 1-D pass: vx* -> vy* */                                        \
    IDCT_HALF                                                               \
                                                                            \
    /* 8x8 transpose in three merge rounds; round 1 (vy* -> vx*) */         \
    vx0 = vec_mergeh(vy0, vy4);                                             \
    vx1 = vec_mergel(vy0, vy4);                                             \
    vx2 = vec_mergeh(vy1, vy5);                                             \
    vx3 = vec_mergel(vy1, vy5);                                             \
    vx4 = vec_mergeh(vy2, vy6);                                             \
    vx5 = vec_mergel(vy2, vy6);                                             \
    vx6 = vec_mergeh(vy3, vy7);                                             \
    vx7 = vec_mergel(vy3, vy7);                                             \
                                                                            \
    /* round 2 (vx* -> vy*) */                                              \
    vy0 = vec_mergeh(vx0, vx4);                                             \
    vy1 = vec_mergel(vx0, vx4);                                             \
    vy2 = vec_mergeh(vx1, vx5);                                             \
    vy3 = vec_mergel(vx1, vx5);                                             \
    vy4 = vec_mergeh(vx2, vx6);                                             \
    vy5 = vec_mergel(vx2, vx6);                                             \
    vy6 = vec_mergeh(vx3, vx7);                                             \
    vy7 = vec_mergel(vx3, vx7);                                             \
                                                                            \
    /* round 3 (vy* -> vx*); bias is added to vx0 only, presumably as    */ \
    /* the rounding term for the final >> 6                              */ \
    vx0 = vec_adds(vec_mergeh(vy0, vy4), bias);                             \
    vx1 = vec_mergel(vy0, vy4);                                             \
    vx2 = vec_mergeh(vy1, vy5);                                             \
    vx3 = vec_mergel(vy1, vy5);                                             \
    vx4 = vec_mergeh(vy2, vy6);                                             \
    vx5 = vec_mergel(vy2, vy6);                                             \
    vx6 = vec_mergeh(vy3, vy7);                                             \
    vx7 = vec_mergel(vy3, vy7);                                             \
                                                                            \
    /* second 1-D pass: vx* -> vy* */                                       \
    IDCT_HALF                                                               \
                                                                            \
    /* final arithmetic shift right by 6, results land in vx0..vx7 */       \
    shift = vec_splat_u16(6);                                               \
    vx0 = vec_sra(vy0, shift);                                              \
    vx1 = vec_sra(vy1, shift);                                              \
    vx2 = vec_sra(vy2, shift);                                              \
    vx3 = vec_sra(vy3, shift);                                              \
    vx4 = vec_sra(vy4, shift);                                              \
    vx5 = vec_sra(vy5, shift);                                              \
    vx6 = vec_sra(vy6, shift);                                              \
    vx7 = vec_sra(vy7, shift);
159 |
|
160 |
|
161 |
static const_vector_s16_t constants[5] = { |
162 |
(vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), |
163 |
(vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), |
164 |
(vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), |
165 |
(vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), |
166 |
(vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) |
167 |
}; |
168 |
|
169 |
/**
 * Run the inverse DCT on an 8x8 coefficient block and overwrite an 8x8 pixel
 * area with the result, saturated to unsigned 8-bit.
 *
 * @param dest   address of the first output pixel; 8 bytes are written per
 *               row via two 32-bit element stores at offsets 0 and 4
 * @param stride byte offset from one output row to the next
 * @param block  coefficient input, read as block[0]..block[7]
 *
 * NOTE(review): the element stores pick their source word from the low
 * address bits, so this appears to rely on dest being 8-byte aligned on
 * every row -- confirm against the callers.
 */
void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vector_u8_t tmp;

    /* NOTE(review): START is guarded by CONFIG_POWERPC_PERF but DECLARE and
     * STOP are not; presumably those macros expand to nothing when perf
     * counting is disabled -- confirm in the header that defines them. */
#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    IDCT

/*
 * Store one output row.  Packing src with itself puts the same 8 saturated
 * pixels in both halves of tmp, so each vec_ste finds the right word
 * whichever half of the vector the destination address selects.
 */
#define COPY(dest,src)                                              \
    do {                                                            \
        tmp = vec_packsu((src), (src));                             \
        vec_ste((vector_u32_t)tmp, 0, (unsigned int *)(dest));      \
        vec_ste((vector_u32_t)tmp, 4, (unsigned int *)(dest));      \
    } while (0)

    COPY(dest, vx0); dest += stride;
    COPY(dest, vx1); dest += stride;
    COPY(dest, vx2); dest += stride;
    COPY(dest, vx3); dest += stride;
    COPY(dest, vx4); dest += stride;
    COPY(dest, vx5); dest += stride;
    COPY(dest, vx6); dest += stride;
    COPY(dest, vx7);

#undef COPY

POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}
195 |
|
196 |
/**
 * Run the inverse DCT on an 8x8 coefficient block and add the result to an
 * existing 8x8 pixel area, saturating to unsigned 8-bit.
 *
 * @param dest   address of the first pixel to update; 8 bytes per row are
 *               read with one vector load and written back with two 32-bit
 *               element stores at offsets 0 and 4
 * @param stride byte offset from one row to the next
 * @param block  coefficient input, read as block[0]..block[7]
 *
 * NOTE(review): only two permute vectors are built (for dest and for
 * dest + stride) and they are reused for alternating rows, so this appears
 * to assume 2 * stride is a multiple of 16 and that each row's 8 pixels lie
 * inside a single aligned 16-byte line -- confirm against the callers.
 */
void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

    /* NOTE(review): START is guarded by CONFIG_POWERPC_PERF but DECLARE and
     * STOP are not; presumably those macros expand to nothing when perf
     * counting is disabled -- confirm in the header that defines them. */
#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    /*
     * Build permute masks that widen 8 pixels to 16 bits: every second index
     * is 0xFF, which selects a byte of the all-zero second operand, and the
     * others are the vec_lvsl indices of the pixels inside the loaded line.
     */
    p0 = vec_lvsl(0, dest);
    p1 = vec_lvsl(stride, dest);
    p = vec_splat_u8(-1);
    perm0 = vec_mergeh(p, p0);
    perm1 = vec_mergeh(p, p1);

/*
 * Update one row: load the aligned line holding the 8 pixels, zero-extend
 * them to 16 bits with perm, add src with saturation, pack back to unsigned
 * bytes and store.  Packing tmp3 with itself duplicates the 8 pixels in both
 * halves of tmp so either half can serve as the store source.
 */
#define ADD(dest,src,perm)                                              \
    do {                                                                \
        tmp = vec_ld(0, (dest));                                        \
        tmp2 = (vector_s16_t)vec_perm(tmp, (vector_u8_t)zero, (perm));  \
        tmp3 = vec_adds(tmp2, (src));                                   \
        tmp = vec_packsu(tmp3, tmp3);                                   \
        vec_ste((vector_u32_t)tmp, 0, (unsigned int *)(dest));          \
        vec_ste((vector_u32_t)tmp, 4, (unsigned int *)(dest));          \
    } while (0)

    ADD(dest, vx0, perm0); dest += stride;
    ADD(dest, vx1, perm1); dest += stride;
    ADD(dest, vx2, perm0); dest += stride;
    ADD(dest, vx3, perm1); dest += stride;
    ADD(dest, vx4, perm0); dest += stride;
    ADD(dest, vx5, perm1); dest += stride;
    ADD(dest, vx6, perm0); dest += stride;
    ADD(dest, vx7, perm1);

#undef ADD

POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}
237 |
|