ffmpeg / libavcodec / ppc / idct_altivec.c @ e3905ce0
History  View  Annotate  Download (11.2 KB)
1 
/*


2 
* Copyright (c) 2001 Michel Lespinasse

3 
*

4 
* This file is part of FFmpeg.

5 
*

6 
* FFmpeg is free software; you can redistribute it and/or

7 
* modify it under the terms of the GNU Lesser General Public

8 
* License as published by the Free Software Foundation; either

9 
* version 2.1 of the License, or (at your option) any later version.

10 
*

11 
* FFmpeg is distributed in the hope that it will be useful,

12 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

13 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 
* Lesser General Public License for more details.

15 
*

16 
* You should have received a copy of the GNU Lesser General Public

17 
* License along with FFmpeg; if not, write to the Free Software

18 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

19 
*/

20  
21 
/*

22 
* NOTE: This code is based on GPL code from the libmpeg2 project. The

23 
* author, Michel Lespinasse, has given explicit permission to release

24 
* under LGPL as part of ffmpeg.

25 
*/

26  
27 
/*

28 
* FFMpeg integration by Dieter Shirley

29 
*

30 
* This file is a direct copy of the altivec idct module from the libmpeg2

31 
* project. I've deleted all of the libmpeg2 specific code, renamed the functions and

32 
* reordered the function parameters. The only change to the IDCT function

33 
* itself was to factor out the partial transposition, and to perform a full

34 
* transpose at the end of the function.

35 
*/

36  
37  
38 
#include <stdlib.h> /* malloc(), free() */ 
39 
#include <string.h> 
40 
#include "libavcodec/dsputil.h" 
41  
42 
#include "gcc_fixes.h" 
43  
44 
#include "dsputil_ppc.h" 
45  
46 
/* Shorthand aliases for the verbose AltiVec vector type names used below. */
#define vector_s16_t vector signed short
#define const_vector_s16_t const vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int
53  
54 
/*
 * One 8-point 1-D IDCT over eight vectors vx0..vx7, leaving the result in
 * vy0..vy7, computed as a four-stage butterfly network.  Relies on the
 * splatted fixed-point constants a0/a1/a2/ma2/c4/mc4, `zero`, and the
 * temporaries t0..t8 all being declared by the enclosing IDCT macro.
 * vec_mradds(a, b, c) is a per-lane saturated c + ((a*b + 0x4000) >> 15),
 * i.e. a rounded Q15 fixed-point multiply-accumulate.
 * Note the deliberate reuse of t1/t3/t8 between stages -- the statement
 * order here must not be changed.
 */
#define IDCT_HALF \
    /* 1st stage */ \
    t1 = vec_mradds (a1, vx7, vx1 ); \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
    t7 = vec_mradds (a2, vx5, vx3); \
    t3 = vec_mradds (ma2, vx3, vx5); \
    \
    /* 2nd stage */ \
    t5 = vec_adds (vx0, vx4); \
    t0 = vec_subs (vx0, vx4); \
    t2 = vec_mradds (a0, vx6, vx2); \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
    t6 = vec_adds (t8, t3); \
    t3 = vec_subs (t8, t3); \
    t8 = vec_subs (t1, t7); \
    t1 = vec_adds (t1, t7); \
    \
    /* 3rd stage */ \
    t7 = vec_adds (t5, t2); \
    t2 = vec_subs (t5, t2); \
    t5 = vec_adds (t0, t4); \
    t0 = vec_subs (t0, t4); \
    t4 = vec_subs (t8, t3); \
    t3 = vec_adds (t8, t3); \
    \
    /* 4th stage */ \
    vy0 = vec_adds (t7, t1); \
    vy7 = vec_subs (t7, t1); \
    vy1 = vec_mradds (c4, t3, t5); \
    vy6 = vec_mradds (mc4, t3, t5); \
    vy2 = vec_mradds (c4, t4, t0); \
    vy5 = vec_mradds (mc4, t4, t0); \
    vy3 = vec_adds (t2, t6); \
    vy4 = vec_subs (t2, t6);
88  
89  
90 
/*
 * Full 2-D 8x8 IDCT.  Declares its own locals, prescales the rows of
 * `block`, runs IDCT_HALF across the rows, transposes via two rounds of
 * 16-bit merges, adds a rounding bias to row 0, runs IDCT_HALF across
 * the columns, and shifts the result down by 6, leaving the final
 * coefficients in vx0..vx7.  Requires `block` (vector_s16_t *, eight
 * vectors = one 8x8 int16 block) and `constants` to be in scope.
 */
#define IDCT \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
    vector_u16_t shift; \
    \
    /* Splat the fixed-point cosine constants out of constants[0]. */ \
    c4 = vec_splat (constants[0], 0); \
    a0 = vec_splat (constants[0], 1); \
    a1 = vec_splat (constants[0], 2); \
    a2 = vec_splat (constants[0], 3); \
    mc4 = vec_splat (constants[0], 4); \
    ma2 = vec_splat (constants[0], 5); \
    /* 32-bit splat of element 3: every lane pair becomes (32, 31). */ \
    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
    \
    zero = vec_splat_s16 (0); \
    shift = vec_splat_u16 (4); \
    \
    /* Prescale: (coeff << 4) * per-row constant, rounded back by >> 15. */ \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
    \
    /* First pass: 1-D IDCT over the rows. */ \
    IDCT_HALF \
    \
    /* 8x8 transpose, round 1 of merges. */ \
    vx0 = vec_mergeh (vy0, vy4); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
    \
    /* 8x8 transpose, round 2 of merges. */ \
    vy0 = vec_mergeh (vx0, vx4); \
    vy1 = vec_mergel (vx0, vx4); \
    vy2 = vec_mergeh (vx1, vx5); \
    vy3 = vec_mergel (vx1, vx5); \
    vy4 = vec_mergeh (vx2, vx6); \
    vy5 = vec_mergel (vx2, vx6); \
    vy6 = vec_mergeh (vx3, vx7); \
    vy7 = vec_mergel (vx3, vx7); \
    \
    /* Final merge round; fold the rounding bias into row 0 for free. */ \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
    \
    /* Second pass: 1-D IDCT over the columns. */ \
    IDCT_HALF \
    \
    /* Scale the fixed-point result down to pixel range. */ \
    shift = vec_splat_u16 (6); \
    vx0 = vec_sra (vy0, shift); \
    vx1 = vec_sra (vy1, shift); \
    vx2 = vec_sra (vy2, shift); \
    vx3 = vec_sra (vy3, shift); \
    vx4 = vec_sra (vy4, shift); \
    vx5 = vec_sra (vy5, shift); \
    vx6 = vec_sra (vy6, shift); \
    vx7 = vec_sra (vy7, shift);
157  
158  
159 
static const_vector_s16_t constants[5] = { 
160 
AVV(23170, 13573, 6518, 21895, 23170, 21895, 32, 31), 
161 
AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), 
162 
AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), 
163 
AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), 
164 
AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) 
165 
}; 
166  
167 
/*
 * IDCT the 8x8 coefficient block and *store* the result: each output row
 * is saturated to unsigned 8-bit and its first 8 bytes written to dest,
 * advancing dest by `stride` bytes per row.
 * NOTE(review): the two vec_ste stores write 4-byte units, so dest is
 * presumably at least 4-byte aligned -- confirm against callers.
 */
void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vector_u8_t tmp;

#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    IDCT

    /* Pack one row to u8 with saturation, then store its low 8 bytes
       as two 4-byte element stores. */
#define COPY(dest,src) \
    tmp = vec_packsu (src, src); \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)

POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}
193  
194 
/*
 * IDCT the 8x8 coefficient block and *add* the result to the existing
 * picture: each destination row is loaded, zero-extended to 16-bit,
 * summed (saturating) with the IDCT output, packed back to unsigned
 * 8-bit and stored, advancing dest by `stride` bytes per row.
 * NOTE(review): the two vec_ste stores write 4-byte units, so dest is
 * presumably at least 4-byte aligned -- confirm against callers.
 */
void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    /*
     * Build permute masks that both realign the (possibly unaligned)
     * destination row and zero-extend its bytes to 16-bit: interleaving
     * 0xFF with the vec_lvsl indices makes the high byte of every 16-bit
     * lane come from the all-zero second vec_perm operand (vec_perm uses
     * only the low 5 bits of each index, so 0xFF selects byte 31).
     * BUGFIX: the splat must be -1 (0xFF), not 1 -- index 1 would fetch
     * a stray pixel byte into the high half of each lane.
     */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

    /* Load a row, widen to s16 via the mask above, add the IDCT row,
       pack with saturation and store the low 8 bytes. */
#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld (0, dest); \
    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
    tmp3 = vec_adds (tmp2, src); \
    tmp = vec_packsu (tmp3, tmp3); \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)

POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}
235 