Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_ppc.c @ 454403ba

History | View | Annotate | Download (10.2 KB)

1
/*
2
 * Copyright (c) 2002 Brian Foley
3
 * Copyright (c) 2002 Dieter Shirley
4
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22

    
23
#include "libavcodec/dsputil.h"
24

    
25
#include "dsputil_ppc.h"
26

    
27
#include "dsputil_altivec.h"
28

    
29
void fdct_altivec(int16_t *block);
30
void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
31
                  int x16, int y16, int rounder);
32
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
33
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
34

    
35
void ff_vp3_idct_altivec(DCTELEM *block);
36
void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block);
37
void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block);
38

    
39
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
40

    
41
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
42
void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
43
void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
44
void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
45

    
46
int mm_flags = 0;
47

    
48
/**
 * Report the SIMD capabilities detected at run time.
 *
 * @return FF_MM_ALTIVEC when the build has AltiVec enabled (HAVE_ALTIVEC)
 *         and has_altivec() reports it as present, 0 otherwise.
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return FF_MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
58

    
59
#if CONFIG_POWERPC_PERF
/* Performance-counter statistics, indexed as [PMC][function][statistic].
 * Reset in dsputil_init_ppc() and printed by powerpc_display_perf_report(). */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
/* Plain (const) char pointers: the entries are string literals and are
 * printed with "%s", so "unsigned char *" was a pointer-signedness mismatch. */
static const char *const perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif /* CONFIG_POWERPC_PERF */
93

    
94
#if CONFIG_POWERPC_PERF
/**
 * Log the collected performance-counter statistics.
 *
 * For every (function, PMC) pair whose sample count is non-zero, one
 * AV_LOG_INFO record is emitted with the minimum, maximum, average and
 * number of samples taken from perfdata[].
 */
void powerpc_display_perf_report(void)
{
    int i, j;
    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for (i = 0; i < powerpc_perf_total; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
            /* Skip never-sampled entries; this also guards the division below. */
            if (perfdata[j][i][powerpc_data_num] == (unsigned long long)0)
                continue;
            /* perfdata holds unsigned long long, while PRIu64 expects
             * uint64_t; cast explicitly so the arguments match the
             * conversion specifiers on every ABI. */
            av_log(NULL, AV_LOG_INFO,
                   " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
                   perfname[i],
                   j+1,
                   (uint64_t)perfdata[j][i][powerpc_data_min],
                   (uint64_t)perfdata[j][i][powerpc_data_max],
                   (double)perfdata[j][i][powerpc_data_sum] /
                   (double)perfdata[j][i][powerpc_data_num],
                   (uint64_t)perfdata[j][i][powerpc_data_num]);
        }
    }
}
#endif /* CONFIG_POWERPC_PERF */
115

    
116
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately all processors used by Apple up to at least the 7450 (aka second
generation G4) use 32-byte cache lines.
This is due to the use of the 'dcbz' instruction. It simply clears a single
cache line to zero, so you need to know the cache line size to use it!
It's absurd, but it's fast...

update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
size: 128 bytes. Oops.
The semantics of dcbz were changed: it always clears 32 bytes. So the function
below will work, but will be slow. So I fixed check_dcbz_effect (now
check_dcbzl_effect) to use dcbzl, which is defined to clear a cache line (as
dcbz did before). So we can still distinguish, and use dcbz (32 bytes) or
dcbzl (one cache line) as required.

see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
136
/**
 * Zero sizeof(DCTELEM)*6*64 bytes of coefficients using the 'dcbz'
 * instruction, 32 bytes per instruction.
 *
 * When bit 4 of the address is set (the buffer starts 16 bytes into a
 * 32-byte line), the first and last 16 bytes are cleared with ordinary
 * stores and dcbz handles the lines in between.
 *
 * @param blocks start of the coefficient buffer to clear
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
#if 1
    if (misal) {
        /* Leading 16 bytes. The previous code stored four "unsigned long"
         * zeros, which is only 16 bytes when long is 4 bytes wide. */
        memset(blocks, 0, 16);
        i += 16;
    }
    for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        /* Trailing 16 bytes. The offset is derived from the buffer size
         * instead of hard-coded "unsigned long" indices 188..191, which
         * land far outside the buffer when long is 8 bytes wide. */
        memset((uint8_t *)blocks + sizeof(DCTELEM)*6*64 - 16, 0, 16);
    }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}
165

    
166
/* same as above, when dcbzl clear a whole 128B cache line
167
   i.e. the PPC970 aka G5 */
168
#if HAVE_DCBZL
169
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
170
{
171
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
172
    register int misal = ((unsigned long)blocks & 0x0000007f);
173
    register int i = 0;
174
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
175
#if 1
176
    if (misal) {
177
        // we could probably also optimize this case,
178
        // but there's not much point as the machines
179
        // aren't available yet (2003-06-26)
180
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
181
    }
182
    else
183
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
184
            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
185
        }
186
#else
187
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
188
#endif
189
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
190
}
191
#else
192
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
193
{
194
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
195
}
196
#endif
197

    
198
#if HAVE_DCBZL
/* check dcbz report how many bytes are set to 0 by dcbz */
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
/**
 * Measure how many bytes a single 'dcbzl' clears.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, one dcbzl is issued on
 * its midpoint, and the zero bytes are counted afterwards.
 *
 * @return the number of zeroed bytes, or 0 if the scratch buffer could not
 *         be allocated.
 */
long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
    /* The "memory" clobber tells the compiler that the instruction writes
       to the buffer; without it the compiler may assume the bytes are
       still 0xFF and fold the counting loop below to a constant. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
}
#else
/* No assembler support for dcbzl: report that nothing is cleared. */
long check_dcbzl_effect(void)
{
  return 0;
}
#endif /* HAVE_DCBZL */
239

    
240
/**
 * Issue a 'dcbt' (data cache block touch) hint for h addresses, starting at
 * mem and advancing by stride bytes each time.
 *
 * The first address is always touched before the count is tested, so h is
 * expected to be at least 1.
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    register const uint8_t *line = mem;
    int remaining = h;

    for (;;) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (line));
        line += stride;
        if (--remaining == 0)
            break;
    }
}
248

    
249
/**
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * The prefetch and clear_blocks hooks are set up whether or not AltiVec is
 * available; clear_blocks is only replaced when check_dcbzl_effect()
 * reports a 32- or 128-byte clear size. When the build has AltiVec enabled
 * and has_altivec() succeeds, the AltiVec initializers are run and the
 * gmc1 / fdct / idct hooks are pointed at their AltiVec versions according
 * to avctx->dct_algo, avctx->idct_algo and avctx->lowres.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context supplying the algorithm selection
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether AltiVec is available or not
    c->prefetch = prefetch_ppc;
    switch (check_dcbzl_effect()) {
        case 32:
            c->clear_blocks = clear_blocks_dcbz32_ppc;
            break;
        case 128:
            c->clear_blocks = clear_blocks_dcbz128_ppc;
            break;
        default:
            break;
    }

#if HAVE_ALTIVEC
    if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);

    if (has_altivec()) {
        mm_flags |= FF_MM_ALTIVEC;

        dsputil_init_altivec(c, avctx);
        if(CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER)
            vc1dsp_init_altivec(c, avctx);
        float_init_altivec(c, avctx);
        int_init_altivec(c, avctx);
        c->gmc1 = gmc1_altivec;

#if CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC) {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        if (avctx->lowres==0) {
            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
                c->idct_put = idct_put_altivec;
                c->idct_add = idct_add_altivec;
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER) &&
                     avctx->idct_algo==FF_IDCT_VP3){
                c->idct_put = ff_vp3_idct_put_altivec;
                c->idct_add = ff_vp3_idct_add_altivec;
                c->idct     = ff_vp3_idct_altivec;
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            }
        }
    }
#endif /* HAVE_ALTIVEC */

#if CONFIG_POWERPC_PERF
    /* Reset the statistics regardless of AltiVec availability: the
     * clear_blocks_dcbz*_ppc routines installed above are perf-counted too,
     * and their "min" slot needs the all-ones starting value just like the
     * AltiVec routines' (presumably the counting macros only ever lower
     * it -- verify against dsputil_ppc.h). */
    {
        int i, j;
        for (i = 0 ; i < powerpc_perf_total ; i++) {
            for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
                perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
            }
        }
    }
#endif /* CONFIG_POWERPC_PERF */
}