/*
 * Copyright (c) 2001 Michel Lespinasse
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * NOTE: This code is based on GPL code from the libmpeg2 project.  The
 * author, Michel Lespinasse, has given explicit permission to release
 * under LGPL as part of FFmpeg.
 */

/*
 * FFmpeg integration by Dieter Shirley
 *
 * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
 * project.  I've deleted all of the libmpeg2-specific code, renamed the
 * functions and re-ordered the function parameters.  The only change to the
 * IDCT function itself was to factor out the partial transposition and to
 * perform a full transpose at the end of the function.
 */


#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>
#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_ppc.h"

#define vector_s16_t vector signed short
#define const_vector_s16_t const vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int

#define IDCT_HALF                                       \
    /* 1st stage */                                     \
    t1 = vec_mradds (a1, vx7, vx1);                     \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
    t7 = vec_mradds (a2, vx5, vx3);                     \
    t3 = vec_mradds (ma2, vx3, vx5);                    \
                                                        \
    /* 2nd stage */                                     \
    t5 = vec_adds (vx0, vx4);                           \
    t0 = vec_subs (vx0, vx4);                           \
    t2 = vec_mradds (a0, vx6, vx2);                     \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
    t6 = vec_adds (t8, t3);                             \
    t3 = vec_subs (t8, t3);                             \
    t8 = vec_subs (t1, t7);                             \
    t1 = vec_adds (t1, t7);                             \
                                                        \
    /* 3rd stage */                                     \
    t7 = vec_adds (t5, t2);                             \
    t2 = vec_subs (t5, t2);                             \
    t5 = vec_adds (t0, t4);                             \
    t0 = vec_subs (t0, t4);                             \
    t4 = vec_subs (t8, t3);                             \
    t3 = vec_adds (t8, t3);                             \
                                                        \
    /* 4th stage */                                     \
    vy0 = vec_adds (t7, t1);                            \
    vy7 = vec_subs (t7, t1);                            \
    vy1 = vec_mradds (c4, t3, t5);                      \
    vy6 = vec_mradds (mc4, t3, t5);                     \
    vy2 = vec_mradds (c4, t4, t0);                      \
    vy5 = vec_mradds (mc4, t4, t0);                     \
    vy3 = vec_adds (t2, t6);                            \
    vy4 = vec_subs (t2, t6);
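
/*
 * For reference: vec_mradds(a, b, c) computes, per signed 16-bit element,
 * a rounded Q15 fractional multiply followed by a saturating add:
 *     saturate(((a * b + 0x4000) >> 15) + c)
 * A minimal scalar model of one element (hypothetical helper, illustrative
 * sketch only, not part of the build):
 */
#if 0
static int16_t mradds_scalar(int16_t a, int16_t b, int16_t c)
{
    /* rounded Q15 multiply, then saturating add */
    int32_t sum = (((int32_t) a * b + 0x4000) >> 15) + c;
    if (sum >  32767) sum =  32767;   /* saturate to the int16_t range */
    if (sum < -32768) sum = -32768;
    return (int16_t) sum;
}
#endif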


#define IDCT                                                            \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
    vector_u16_t shift;                                                 \
                                                                        \
    c4 = vec_splat (constants[0], 0);                                   \
    a0 = vec_splat (constants[0], 1);                                   \
    a1 = vec_splat (constants[0], 2);                                   \
    a2 = vec_splat (constants[0], 3);                                   \
    mc4 = vec_splat (constants[0], 4);                                  \
    ma2 = vec_splat (constants[0], 5);                                  \
    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3);     \
                                                                        \
    zero = vec_splat_s16 (0);                                           \
    shift = vec_splat_u16 (4);                                          \
                                                                        \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    vx0 = vec_mergeh (vy0, vy4);                                        \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    vy0 = vec_mergeh (vx0, vx4);                                        \
    vy1 = vec_mergel (vx0, vx4);                                        \
    vy2 = vec_mergeh (vx1, vx5);                                        \
    vy3 = vec_mergel (vx1, vx5);                                        \
    vy4 = vec_mergeh (vx2, vx6);                                        \
    vy5 = vec_mergel (vx2, vx6);                                        \
    vy6 = vec_mergeh (vx3, vx7);                                        \
    vy7 = vec_mergel (vx3, vx7);                                        \
                                                                        \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    shift = vec_splat_u16 (6);                                          \
    vx0 = vec_sra (vy0, shift);                                         \
    vx1 = vec_sra (vy1, shift);                                         \
    vx2 = vec_sra (vy2, shift);                                         \
    vx3 = vec_sra (vy3, shift);                                         \
    vx4 = vec_sra (vy4, shift);                                         \
    vx5 = vec_sra (vy5, shift);                                         \
    vx6 = vec_sra (vy6, shift);                                         \
    vx7 = vec_sra (vy7, shift);
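
/*
 * The three mergeh/mergel passes between the two IDCT_HALF invocations
 * implement the factored-out 8x8 transpose of 16-bit elements mentioned in
 * the integration note above: each pass interleaves row pairs at stride 4,
 * and log2(8) = 3 such passes complete the transpose.  A scalar equivalent
 * (hypothetical helper, illustrative only, not part of the build):
 */
#if 0
static void transpose_8x8(int16_t m[8][8])
{
    int i, j;
    for (i = 0; i < 8; i++)
        for (j = i + 1; j < 8; j++) {
            int16_t t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}
#endif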


static const_vector_s16_t constants[5] = {
    AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
    AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
    AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
    AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
    AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
};
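
/*
 * These appear to be the usual fixed-point coefficients of the libmpeg2
 * derivation.  constants[0] packs the butterfly terms in Q15:
 *     23170 ~= cos(4*pi/16) * 2^15   (c4)
 *     13573 ~= tan(2*pi/16) * 2^15   (a0)
 *      6518 ~= tan(1*pi/16) * 2^15   (a1)
 *     21895 ~= tan(3*pi/16) * 2^15   (a2)
 * plus negated copies for mc4/ma2; the final lanes (32, 31) form the 32-bit
 * rounding bias that IDCT splats before the second pass.  The rows
 * constants[1..4] appear to be Q14 prescale products built from
 * sqrt(2) * cos(k*pi/16), folded into the initial coefficient multiplies.
 * A generator sketch for one such factor (hypothetical helper, illustrative
 * only, not part of the build):
 */
#if 0
#include <math.h>
static int16_t prescale_q14(int k)
{
    /* sqrt(2) * cos(k*pi/16) in Q14; k = 4 gives exactly 1.0 -> 16384,
     * k = 1 gives 22725, k = 2 gives 21407, k = 3 gives 19266. */
    return (int16_t) (sqrt(2.0) * cos(k * M_PI / 16.0) * 16384.0 + 0.5);
}
#endif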

void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vector_u8_t tmp;

#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    IDCT

#define COPY(dest,src)                                          \
    tmp = vec_packsu (src, src);                                \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);       \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
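
/*
 * COPY saturates one row of 16-bit results to unsigned bytes (vec_packsu
 * duplicates the row into both halves of tmp) and writes it with two 4-byte
 * element stores; vec_ste touches only the addressed word, so exactly 8
 * destination bytes are written per row.  The duplicated pack presumably
 * relies on dest being at least 8-byte aligned so that the matching element
 * of tmp is selected at each store.
 */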

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)

POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}

void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
{
POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);
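
    /*
     * vec_lvsl returns the alignment permute vector for dest; interleaving
     * it with 0xFF bytes builds a control vector that, in ADD below, both
     * aligns the 8 destination bytes and widens them to 16-bit lanes:
     * vec_perm keeps only the low 5 bits of each control byte, so 0xFF
     * selects the last byte of the second (all-zero) operand, giving a zero
     * high byte in every lane.
     */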

#define ADD(dest,src,perm)                                              \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);       \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);               \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
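
/*
 * Scalar model of one ADD row (hypothetical helper, illustrative only, not
 * part of the build): widen eight destination bytes, add the IDCT output,
 * repack with unsigned saturation.
 */
#if 0
static void add_row_scalar(uint8_t *dest, const int16_t src[8])
{
    int i;
    for (i = 0; i < 8; i++) {
        int v = dest[i] + src[i];
        dest[i] = v < 0 ? 0 : (v > 255 ? 255 : v);
    }
}
#endif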

    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)

POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}