Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_ppc.c @ 7160bb71

History | View | Annotate | Download (6.42 KB)

/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22

    
23
#include "libavcodec/dsputil.h"
24
#include "dsputil_altivec.h"
25

    
26
/**
 * Report the SIMD capability flags of the CPU this code is running on.
 *
 * @return AV_CPU_FLAG_ALTIVEC when the build has HAVE_ALTIVEC enabled and
 *         has_altivec() returns non-zero at run time; 0 otherwise.
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec()) {
        return AV_CPU_FLAG_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return 0;
}
36

    
37
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately all processors used by Apple up to at least the 7450 (aka second
generation G4) use 32-byte cache lines.
This is due to the use of the 'dcbz' instruction. It simply clears a single
cache line to zero, so you need to know the cache line size to use it!
It's absurd, but it's fast...

update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
size: 128 bytes. Oops.
The semantics of dcbz were changed: it now always clears 32 bytes. So the
function below will work, but will be slow. So I fixed check_dcbzl_effect to
use dcbzl, which is defined to clear a cache line (as dcbz did before). So we
can still distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as
required.

see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
57
/**
 * Zero sizeof(DCTELEM)*6*64 bytes of coefficients using the 'dcbz'
 * instruction, stepping 32 bytes per instruction.
 *
 * When bit 4 of the buffer address is set, the first 16 bytes are cleared
 * with ordinary stores so that the dcbz loop starts 16 bytes further in,
 * and the last 16 bytes, which the loop then no longer reaches, are
 * cleared the same way afterwards.
 *
 * The head and tail are addressed in bytes rather than through an
 * 'unsigned long *': the byte offsets then do not depend on
 * sizeof(unsigned long), and the DCTELEM storage is not written through
 * an incompatible pointer type.
 *
 * @param blocks buffer to clear.
 *               NOTE(review): only bit 4 of the address is examined, so the
 *               buffer is presumably expected to be at least 16-byte
 *               aligned -- confirm against the callers.
 */
static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
    const size_t total = sizeof(DCTELEM) * 6 * 64;
    const int misal = ((unsigned long)blocks & 0x00000010) != 0;
    unsigned char *bytes = (unsigned char *)blocks;
    int i = 0;

    if (misal) {
        /* Leading 16 bytes, up to the next 32-byte boundary. */
        memset(bytes, 0, 16);
        i += 16;
    }

    /* Stop early enough that every dcbz clears 32 bytes inside the buffer. */
    for ( ; (size_t)i < total - 31 ; i += 32) {
        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }

    if (misal) {
        /* Trailing 16 bytes left over after the shifted dcbz loop. */
        memset(bytes + total - 16, 0, 16);
    }
}
83

    
84
/* same as above, when dcbzl clear a whole 128B cache line
85
   i.e. the PPC970 aka G5 */
86
#if HAVE_DCBZL
87
static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
88
{
89
    register int misal = ((unsigned long)blocks & 0x0000007f);
90
    register int i = 0;
91
#if 1
92
    if (misal) {
93
        // we could probably also optimize this case,
94
        // but there's not much point as the machines
95
        // aren't available yet (2003-06-26)
96
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
97
    }
98
    else
99
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
100
            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
101
        }
102
#else
103
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
104
#endif
105
}
106
#else
107
static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
108
{
109
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
110
}
111
#endif
112

    
113
#if HAVE_DCBZL
/**
 * Measure how many bytes a single dcbzl instruction sets to zero.
 *
 * A 1024-byte buffer is filled with 0xFF, one dcbzl is issued on the
 * address in the middle of it, and the zero bytes in the whole buffer are
 * then counted.
 *
 * update 24/06/2003: dcbzl is used instead of dcbz to get the intended
 * effect (Apple "fixed" dcbz). Unfortunately this can only be built when
 * the assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return number of zero bytes found in the probe buffer after the dcbzl,
 *         or 0 if the buffer could not be allocated.
 */
static long check_dcbzl_effect(void)
{
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero = 0;
    long i;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = fakedata + 512;

    memset(fakedata, 0xFF, 1024);

    /* The "b" constraint seems to mean "address base register" in
     * gcc-3.3 / RS/6000 terms; it seems to avoid using r0.
     * The "memory" clobber tells the compiler that the instruction writes
     * to memory, so the bytes read below are not assumed to still hold
     * the 0xFF pattern stored by memset. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024; i++) {
        if (fakedata[i] == (char)0) {
            count++;
        }
    }

    av_free(fakedata);

    return count;
}
#else
/**
 * Fallback used when HAVE_DCBZL is unset: no measurement is made.
 *
 * @return always 0.
 */
static long check_dcbzl_effect(void)
{
    return 0;
}
#endif
154

    
155
/**
 * Issue a 'dcbt' cache-touch instruction for the start of each of h rows.
 *
 * @param mem    address of the first row
 * @param stride byte distance added to the address after each row
 * @param h      number of rows; nothing is done when h <= 0
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    const uint8_t *p = mem;

    /* Counting down with "h > 0" (instead of do/while(--h)) keeps a zero or
     * negative row count from looping until the counter wraps around. */
    for ( ; h > 0; h--) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    }
}
163

    
164
/**
 * Install the PowerPC implementations of the DSP routines into a context.
 *
 * The prefetch and clear_blocks hooks are set whether or not AltiVec is
 * available; everything else is compiled only under HAVE_ALTIVEC and,
 * apart from the H.264 init call, applied only when has_altivec() reports
 * the unit at run time.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context; its dct_algo, idct_algo and lowres fields
 *              are read to decide which transforms to install
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    long dcbzl_bytes;

    /* Common optimizations whether AltiVec is available or not */
    c->prefetch = prefetch_ppc;

    /* Pick the clear_blocks variant matching the number of bytes one dcbzl
     * was measured to zero; any other result leaves c->clear_blocks as is. */
    dcbzl_bytes = check_dcbzl_effect();
    if (dcbzl_bytes == 32) {
        c->clear_blocks = clear_blocks_dcbz32_ppc;
    } else if (dcbzl_bytes == 128) {
        c->clear_blocks = clear_blocks_dcbz128_ppc;
    }

#if HAVE_ALTIVEC
    if (CONFIG_H264_DECODER) {
        dsputil_h264_init_ppc(c, avctx);
    }

    /* Everything below requires AltiVec at run time. */
    if (!has_altivec()) {
        return;
    }

    dsputil_init_altivec(c, avctx);
    if (CONFIG_VC1_DECODER) {
        vc1dsp_init_altivec(c, avctx);
    }
    float_init_altivec(c, avctx);
    int_init_altivec(c, avctx);
    c->gmc1 = gmc1_altivec;

#if CONFIG_ENCODERS
    if (avctx->dct_algo == FF_DCT_AUTO ||
        avctx->dct_algo == FF_DCT_ALTIVEC) {
        c->fdct = fdct_altivec;
    }
#endif /* CONFIG_ENCODERS */

    /* The AltiVec IDCTs are installed only when lowres is 0. */
    if (avctx->lowres != 0) {
        return;
    }

    if (avctx->idct_algo == FF_IDCT_AUTO ||
        avctx->idct_algo == FF_IDCT_ALTIVEC) {
        c->idct_put = idct_put_altivec;
        c->idct_add = idct_add_altivec;
        c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
               avctx->idct_algo == FF_IDCT_VP3) {
        c->idct_put = ff_vp3_idct_put_altivec;
        c->idct_add = ff_vp3_idct_add_altivec;
        c->idct     = ff_vp3_idct_altivec;
        c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    }
#endif /* HAVE_ALTIVEC */
}