Revision 481fb636

View differences:

libavcodec/Makefile

                                           ppc/gmc_altivec.o          \
                                           ppc/fdct_altivec.o         \
                                           ppc/dsputil_h264_altivec.o \
-                                          ppc/dsputil_snow_altivec.o
+                                          ppc/dsputil_snow_altivec.o \
+                                          ppc/vc1dsp_altivec.o
 
 CFLAGS += $(CFLAGS-yes)
 OBJS += $(OBJS-yes)

libavcodec/ppc/dsputil_ppc.c

 }
 #endif
 
+#ifdef HAVE_ALTIVEC
+void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
+#endif
+
 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
 {
     // Common optimizations whether Altivec is available or not
......
         c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
         c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
 
+        vc1dsp_init_altivec(c, avctx);
 #ifdef CONFIG_ENCODERS
         if (avctx->dct_algo == FF_DCT_AUTO ||
             avctx->dct_algo == FF_DCT_ALTIVEC)

libavcodec/ppc/vc1dsp_altivec.c (new file)

/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

// Transpose 8x8 matrix of 16-bit elements. Borrowed from mpegvideo_altivec.c
#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
do { \
    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
 \
    A1 = vec_mergeh (a, e); \
    B1 = vec_mergel (a, e); \
    C1 = vec_mergeh (b, f); \
    D1 = vec_mergel (b, f); \
    E1 = vec_mergeh (c, g); \
    F1 = vec_mergel (c, g); \
    G1 = vec_mergeh (d, h); \
    H1 = vec_mergel (d, h); \
 \
    A2 = vec_mergeh (A1, E1); \
    B2 = vec_mergel (A1, E1); \
    C2 = vec_mergeh (B1, F1); \
    D2 = vec_mergel (B1, F1); \
    E2 = vec_mergeh (C1, G1); \
    F2 = vec_mergel (C1, G1); \
    G2 = vec_mergeh (D1, H1); \
    H2 = vec_mergel (D1, H1); \
 \
    a = vec_mergeh (A2, E2); \
    b = vec_mergel (A2, E2); \
    c = vec_mergeh (B2, F2); \
    d = vec_mergel (B2, F2); \
    e = vec_mergeh (C2, G2); \
    f = vec_mergel (C2, G2); \
    g = vec_mergeh (D2, H2); \
    h = vec_mergel (D2, H2); \
} while (0)

// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
\
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
\
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
\
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
\
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
\
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
}while(0)

#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
    s4 = vec_sra(s4, vec_3); \
    s5 = vec_sra(s5, vec_3); \
    s6 = vec_sra(s6, vec_3); \
    s7 = vec_sra(s7, vec_3); \
}while(0)

#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
}while(0)

/* main steps of 4x4 transform */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
    t1 = vec_add(vec_sl(s0, vec_4), s0); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s2, vec_4), s2); \
    t0 = vec_add(t1, t2); \
    t1 = vec_sub(t1, t2); \
    t3 = vec_sl(vec_sub(s3, s1), vec_1); \
    t3 = vec_add(t3, vec_sl(t3, vec_2)); \
    t2 = vec_add(t3, vec_sl(s1, vec_5)); \
    t3 = vec_add(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s3, vec_2)); \
    s0 = vec_add(t0, t2); \
    s1 = vec_sub(t1, t3); \
    s2 = vec_add(t1, t3); \
    s3 = vec_sub(t0, t2); \
}while (0)

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);

/** Do inverse transform on 8x8 block
*/
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector  signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector  signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);


    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    vec_st(src0,  0, block);
    vec_st(src1, 16, block);
    vec_st(src2, 32, block);
    vec_st(src3, 48, block);
    vec_st(src4, 64, block);
    vec_st(src5, 80, block);
    vec_st(src6, 96, block);
    vec_st(src7,112, block);
}

/** Do inverse transform on 8x4 part of block
*/
static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector  signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    if(!n){ // upper half of block
        s0 = vec_unpackh(src0);
        s1 = vec_unpackh(src1);
        s2 = vec_unpackh(src2);
        s3 = vec_unpackh(src3);
        s8 = vec_unpackl(src0);
        s9 = vec_unpackl(src1);
        sA = vec_unpackl(src2);
        sB = vec_unpackl(src3);
        STEP4(s0, s1, s2, s3, vec_64);
        SHIFT_VERT4(s0, s1, s2, s3);
        STEP4(s8, s9, sA, sB, vec_64);
        SHIFT_VERT4(s8, s9, sA, sB);
        src0 = vec_pack(s0, s8);
        src1 = vec_pack(s1, s9);
        src2 = vec_pack(s2, sA);
        src3 = vec_pack(s3, sB);

        vec_st(src0,  0, block);
        vec_st(src1, 16, block);
        vec_st(src2, 32, block);
        vec_st(src3, 48, block);
    } else { //lower half of block
        s0 = vec_unpackh(src4);
        s1 = vec_unpackh(src5);
        s2 = vec_unpackh(src6);
        s3 = vec_unpackh(src7);
        s8 = vec_unpackl(src4);
        s9 = vec_unpackl(src5);
        sA = vec_unpackl(src6);
        sB = vec_unpackl(src7);
        STEP4(s0, s1, s2, s3, vec_64);
        SHIFT_VERT4(s0, s1, s2, s3);
        STEP4(s8, s9, sA, sB, vec_64);
        SHIFT_VERT4(s8, s9, sA, sB);
        src4 = vec_pack(s0, s8);
        src5 = vec_pack(s1, s9);
        src6 = vec_pack(s2, sA);
        src7 = vec_pack(s3, sB);

        vec_st(src4, 64, block);
        vec_st(src5, 80, block);
        vec_st(src6, 96, block);
        vec_st(src7,112, block);
    }
}


void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
}
