ffmpeg / libavcodec / ppc / idct_altivec.c @ 80a61f08
History | View | Annotate | Download (11.2 KB)
1 |
/*
|
---|---|
2 |
* Copyright (c) 2001 Michel Lespinasse
|
3 |
*
|
4 |
* This file is part of FFmpeg.
|
5 |
*
|
6 |
* FFmpeg is free software; you can redistribute it and/or
|
7 |
* modify it under the terms of the GNU Lesser General Public
|
8 |
* License as published by the Free Software Foundation; either
|
9 |
* version 2.1 of the License, or (at your option) any later version.
|
10 |
*
|
11 |
* FFmpeg is distributed in the hope that it will be useful,
|
12 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 |
* Lesser General Public License for more details.
|
15 |
*
|
16 |
* You should have received a copy of the GNU Lesser General Public
|
17 |
* License along with FFmpeg; if not, write to the Free Software
|
18 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
19 |
*/
|
20 |
|
21 |
/*
|
22 |
* NOTE: This code is based on GPL code from the libmpeg2 project. The
|
23 |
* author, Michel Lespinasse, has given explicit permission to release
|
24 |
* under LGPL as part of ffmpeg.
|
25 |
*/
|
26 |
|
27 |
/*
|
28 |
* FFmpeg integration by Dieter Shirley
|
29 |
*
|
30 |
* This file is a direct copy of the altivec idct module from the libmpeg2
|
31 |
* project. I've deleted all of the libmpeg2 specific code, renamed the functions and
|
32 |
* re-ordered the function parameters. The only change to the IDCT function
|
33 |
* itself was to factor out the partial transposition, and to perform a full
|
34 |
* transpose at the end of the function.
|
35 |
*/
|
36 |
|
37 |
|
38 |
#include <stdlib.h> /* malloc(), free() */ |
39 |
#include <string.h> |
40 |
#include "libavcodec/dsputil.h" |
41 |
|
42 |
#include "gcc_fixes.h" |
43 |
|
44 |
#include "dsputil_ppc.h" |
45 |
|
46 |
#define vector_s16_t vector signed short |
47 |
#define const_vector_s16_t const vector signed short |
48 |
#define vector_u16_t vector unsigned short |
49 |
#define vector_s8_t vector signed char |
50 |
#define vector_u8_t vector unsigned char |
51 |
#define vector_s32_t vector signed int |
52 |
#define vector_u32_t vector unsigned int |
53 |
|
54 |
/*
 * IDCT_HALF: one pass of the inverse DCT butterfly network.
 *
 * Inputs : vx0..vx7 (one vector per input row) and the splatted constants
 *          a0, a1, a2, ma2, c4, mc4 plus the all-zero vector "zero".
 * Outputs: vy0..vy7.
 * Scratch: t0..t8.  Several temporaries (t1, t3, t8, ...) are overwritten in
 *          place, so the statement order below must not be changed.
 *
 * None of these names are declared here; the expansion site has to provide
 * them (the IDCT macro does).  Every operation used is a saturating 16-bit
 * vector operation (vec_adds, vec_subs, vec_mradds), so all lanes of the
 * vectors are processed independently and in parallel.
 */
#define IDCT_HALF \
|
55 |
/* 1st stage: odd inputs, vx1/vx7 scaled by a1 and vx3/vx5 by a2/ma2 */ \
|
56 |
t1 = vec_mradds (a1, vx7, vx1 ); \ |
57 |
t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ |
58 |
t7 = vec_mradds (a2, vx5, vx3); \ |
59 |
t3 = vec_mradds (ma2, vx3, vx5); \ |
60 |
\ |
61 |
/* 2nd stage: even inputs (vx0/vx4 sum and difference, vx2/vx6 scaled by a0), then butterflies on the 1st stage results */ \
|
62 |
t5 = vec_adds (vx0, vx4); \ |
63 |
t0 = vec_subs (vx0, vx4); \ |
64 |
t2 = vec_mradds (a0, vx6, vx2); \ |
65 |
t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ |
66 |
t6 = vec_adds (t8, t3); \ |
67 |
t3 = vec_subs (t8, t3); \ |
68 |
t8 = vec_subs (t1, t7); \ |
69 |
t1 = vec_adds (t1, t7); \ |
70 |
\ |
71 |
/* 3rd stage: sum/difference butterflies on the 2nd stage results */ \
|
72 |
t7 = vec_adds (t5, t2); \ |
73 |
t2 = vec_subs (t5, t2); \ |
74 |
t5 = vec_adds (t0, t4); \ |
75 |
t0 = vec_subs (t0, t4); \ |
76 |
t4 = vec_subs (t8, t3); \ |
77 |
t3 = vec_adds (t8, t3); \ |
78 |
\ |
79 |
/* 4th stage: produce vy0..vy7; vy1/vy6 and vy2/vy5 are scaled by c4/mc4 */ \
|
80 |
vy0 = vec_adds (t7, t1); \ |
81 |
vy7 = vec_subs (t7, t1); \ |
82 |
vy1 = vec_mradds (c4, t3, t5); \ |
83 |
vy6 = vec_mradds (mc4, t3, t5); \ |
84 |
vy2 = vec_mradds (c4, t4, t0); \ |
85 |
vy5 = vec_mradds (mc4, t4, t0); \ |
86 |
vy3 = vec_adds (t2, t6); \ |
87 |
vy4 = vec_subs (t2, t6); |
88 |
|
89 |
|
90 |
/*
 * IDCT: full 8x8 inverse DCT, expanded inline in the calling function.
 *
 * Requires in scope: "block" (indexed as block[0]..block[7], one vector of
 * 16-bit coefficients per row) and the file-level "constants" table.
 * Declares (in the caller's scope): vx0..vx7, vy0..vy7, t0..t8, the
 * splatted constants a0, a1, a2, ma2, c4, mc4, plus zero, bias and shift.
 * Because it starts with declarations, it must be expanded where a
 * declaration is allowed.
 * Result: vx0..vx7 hold the eight output rows as signed 16-bit values.
 *
 * Steps, in order:
 *   1. Splat elements 0..5 of constants[0] into c4, a0, a1, a2, mc4, ma2,
 *      and splat 32-bit word 3 of constants[0] (the element pair 32, 31)
 *      into bias.
 *   2. Pre-scale: shift every input row left by 4 and multiply it
 *      (vec_mradds, zero addend) by its row of constants[1..4].
 *   3. First IDCT_HALF pass (vx -> vy).
 *   4. Three rounds of vec_mergeh/vec_mergel interleaving; this is the usual
 *      8x8 transpose pattern for 16-bit elements.  bias is added to vx0
 *      only, in the last round -- presumably the rounding term for the
 *      final shift (TODO confirm).
 *   5. Second IDCT_HALF pass (vx -> vy).
 *   6. Arithmetic shift right by 6 of every row (vy -> vx).
 */
#define IDCT \
|
91 |
vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ |
92 |
vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ |
93 |
vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \ |
94 |
vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ |
95 |
vector_u16_t shift; \ |
96 |
\ |
97 |
c4 = vec_splat (constants[0], 0); \ |
98 |
a0 = vec_splat (constants[0], 1); \ |
99 |
a1 = vec_splat (constants[0], 2); \ |
100 |
a2 = vec_splat (constants[0], 3); \ |
101 |
mc4 = vec_splat (constants[0], 4); \ |
102 |
ma2 = vec_splat (constants[0], 5); \ |
103 |
bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ |
104 |
\ |
105 |
zero = vec_splat_s16 (0); \
|
106 |
shift = vec_splat_u16 (4); \
|
107 |
\ |
108 |
vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ |
109 |
vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ |
110 |
vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ |
111 |
vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ |
112 |
vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ |
113 |
vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ |
114 |
vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ |
115 |
vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ |
116 |
\ |
117 |
IDCT_HALF \ |
118 |
\ |
119 |
vx0 = vec_mergeh (vy0, vy4); \ |
120 |
vx1 = vec_mergel (vy0, vy4); \ |
121 |
vx2 = vec_mergeh (vy1, vy5); \ |
122 |
vx3 = vec_mergel (vy1, vy5); \ |
123 |
vx4 = vec_mergeh (vy2, vy6); \ |
124 |
vx5 = vec_mergel (vy2, vy6); \ |
125 |
vx6 = vec_mergeh (vy3, vy7); \ |
126 |
vx7 = vec_mergel (vy3, vy7); \ |
127 |
\ |
128 |
vy0 = vec_mergeh (vx0, vx4); \ |
129 |
vy1 = vec_mergel (vx0, vx4); \ |
130 |
vy2 = vec_mergeh (vx1, vx5); \ |
131 |
vy3 = vec_mergel (vx1, vx5); \ |
132 |
vy4 = vec_mergeh (vx2, vx6); \ |
133 |
vy5 = vec_mergel (vx2, vx6); \ |
134 |
vy6 = vec_mergeh (vx3, vx7); \ |
135 |
vy7 = vec_mergel (vx3, vx7); \ |
136 |
\ |
137 |
vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ |
138 |
vx1 = vec_mergel (vy0, vy4); \ |
139 |
vx2 = vec_mergeh (vy1, vy5); \ |
140 |
vx3 = vec_mergel (vy1, vy5); \ |
141 |
vx4 = vec_mergeh (vy2, vy6); \ |
142 |
vx5 = vec_mergel (vy2, vy6); \ |
143 |
vx6 = vec_mergeh (vy3, vy7); \ |
144 |
vx7 = vec_mergel (vy3, vy7); \ |
145 |
\ |
146 |
IDCT_HALF \ |
147 |
\ |
148 |
shift = vec_splat_u16 (6); \
|
149 |
vx0 = vec_sra (vy0, shift); \ |
150 |
vx1 = vec_sra (vy1, shift); \ |
151 |
vx2 = vec_sra (vy2, shift); \ |
152 |
vx3 = vec_sra (vy3, shift); \ |
153 |
vx4 = vec_sra (vy4, shift); \ |
154 |
vx5 = vec_sra (vy5, shift); \ |
155 |
vx6 = vec_sra (vy6, shift); \ |
156 |
vx7 = vec_sra (vy7, shift); |
157 |
|
158 |
|
159 |
/*
 * Constant vectors consumed by the IDCT macro.
 *
 * constants[0]: scalar factors, splatted one element at a time:
 *     [0] -> c4, [1] -> a0, [2] -> a1, [3] -> a2, [4] -> mc4, [5] -> ma2
 *     ([4] and [5] are the negations of [0] and [3]);
 *     [6],[7] (32, 31) are splatted together as one 32-bit word into bias.
 * constants[1..4]: per-row pre-scale vectors for the input coefficients:
 *     constants[1] -> rows 0 and 4, constants[2] -> rows 1 and 7,
 *     constants[3] -> rows 2 and 6, constants[4] -> rows 3 and 5.
 */
static const_vector_s16_t constants[5] = {
160 |
{23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
161 |
{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
162 |
{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
163 |
{21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
164 |
{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
165 |
}; |
166 |
|
167 |
/**
 * Inverse-transform one 8x8 coefficient block and store the result as
 * unsigned 8-bit pixels, overwriting the destination.
 *
 * @param dest   address of the first pixel of the first output row.  Each
 *               row is written with two 32-bit vec_ste element stores, which
 *               only land in the right order if dest is 8-byte aligned --
 *               NOTE(review): assumed from the store pattern, confirm
 *               against callers.
 * @param stride byte distance between consecutive output rows
 * @param block  coefficients, read by the IDCT macro as block[0]..block[7]
 *               (one vector of eight 16-bit values per row)
 */
void idct_put_altivec(uint8_t *dest, int stride, vector_s16_t *block)
{
    POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vector_u8_t tmp;

#ifdef CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    /* Declares vx0..vx7 (and its scratch variables) in this scope and
     * leaves one output row in each of vx0..vx7. */
    IDCT

/* Saturate one row to unsigned bytes and write its 8 pixels.
 * vec_packsu(src, src) puts the same 8 bytes in both halves of tmp, so the
 * element that vec_ste selects from the store address holds the right data
 * whether dest falls in the low or the high half of a 16-byte line.
 * Wrapped in do/while(0) so it behaves as a single statement. */
#define COPY(dest, src)                                             \
    do {                                                            \
        tmp = vec_packsu((src), (src));                             \
        vec_ste((vector_u32_t)tmp, 0, (unsigned int *)(dest));      \
        vec_ste((vector_u32_t)tmp, 4, (unsigned int *)(dest));      \
    } while (0)

    COPY(dest, vx0); dest += stride;
    COPY(dest, vx1); dest += stride;
    COPY(dest, vx2); dest += stride;
    COPY(dest, vx3); dest += stride;
    COPY(dest, vx4); dest += stride;
    COPY(dest, vx5); dest += stride;
    COPY(dest, vx6); dest += stride;
    COPY(dest, vx7);

/* Helper is private to this function; do not leak it into the rest of the
 * translation unit. */
#undef COPY

    POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}
193 |
|
194 |
/**
 * Inverse-transform one 8x8 coefficient block and add the result to the
 * existing destination pixels, saturating to the unsigned 8-bit range.
 *
 * @param dest   address of the first pixel of the first row to update.  The
 *               permute masks built below only pick the right 8 bytes out of
 *               the aligned 16-byte load if dest is 8-byte aligned --
 *               NOTE(review): assumed from the load/permute pattern, confirm
 *               against callers.
 * @param stride byte distance between consecutive rows.  One permute mask is
 *               reused for all even rows and one for all odd rows, which
 *               presumes the alignment of dest within a 16-byte line repeats
 *               every two rows -- NOTE(review): confirm.
 * @param block  coefficients, read by the IDCT macro as block[0]..block[7]
 *               (one vector of eight 16-bit values per row)
 */
void idct_add_altivec(uint8_t *dest, int stride, vector_s16_t *block)
{
    POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

#ifdef CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    /* Declares vx0..vx7, zero (and its scratch variables) in this scope and
     * leaves one residual row in each of vx0..vx7. */
    IDCT

    /* Build the byte-to-short expansion masks.  vec_lvsl yields the byte
     * indices starting at the address' offset within its 16-byte line;
     * interleaving them with 0xFF bytes makes vec_perm emit, for every
     * pixel, one byte taken from the second operand (the zero vector)
     * followed by the pixel byte itself, i.e. the pixel zero-extended to
     * 16 bits.  perm0 is derived from dest (even rows), perm1 from
     * dest + stride (odd rows). */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

/* Add one residual row to the 8 pixels at dest:
 *   - load the aligned 16-byte line containing dest,
 *   - expand the 8 pixels to 16-bit lanes with the given permute mask,
 *   - saturating add of the residual, saturating pack back to bytes,
 *   - write the 8 bytes back with two 32-bit element stores (both halves of
 *     tmp hold the same data, so either half of the line is served).
 * Wrapped in do/while(0) so it behaves as a single statement. */
#define ADD(dest, src, perm)                                               \
    do {                                                                   \
        tmp  = vec_ld (0, (dest));                                         \
        tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, (perm));    \
        tmp3 = vec_adds (tmp2, (src));                                     \
        tmp  = vec_packsu (tmp3, tmp3);                                    \
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)(dest));            \
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)(dest));            \
    } while (0)

    ADD(dest, vx0, perm0); dest += stride;
    ADD(dest, vx1, perm1); dest += stride;
    ADD(dest, vx2, perm0); dest += stride;
    ADD(dest, vx3, perm1); dest += stride;
    ADD(dest, vx4, perm0); dest += stride;
    ADD(dest, vx5, perm1); dest += stride;
    ADD(dest, vx6, perm0); dest += stride;
    ADD(dest, vx7, perm1);

/* Helper is private to this function; do not leak it into the rest of the
 * translation unit. */
#undef ADD

    POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}
235 |
|