Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_ppc.c @ ccf22d3e

History | View | Annotate | Download (6.27 KB)

1 05c4072b Michael Niedermayer
/*
2
 * Copyright (c) 2002 Brian Foley
3
 * Copyright (c) 2002 Dieter Shirley
4 c4a17148 Michael Niedermayer
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 05c4072b Michael Niedermayer
 *
6 b78e7197 Diego Biurrun
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9 05c4072b Michael Niedermayer
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11 b78e7197 Diego Biurrun
 * version 2.1 of the License, or (at your option) any later version.
12 05c4072b Michael Niedermayer
 *
13 b78e7197 Diego Biurrun
 * FFmpeg is distributed in the hope that it will be useful,
14 05c4072b Michael Niedermayer
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19 b78e7197 Diego Biurrun
 * License along with FFmpeg; if not, write to the Free Software
20 5509bffa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 05c4072b Michael Niedermayer
 */
22
23 245976da Diego Biurrun
#include "libavcodec/dsputil.h"
24 ab6c65f6 Brian Foley
#include "dsputil_altivec.h"
25
26 35e5fb06 Romain Dolbeau
/* ***** WARNING ***** WARNING ***** WARNING ***** */
27
/*
28 e3905ce0 Diego Biurrun
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
29
cache line size not equal to 32 bytes.
30
Fortunately all processors used by Apple up to at least the 7450 (aka second
31
generation G4) use a 32-byte cache line.
32
This is due to the use of the 'dcbz' instruction. It simply clears to zero a
33
single cache line, so you need to know the cache line size to use it !
34
It's absurd, but it's fast...
35 a4adb608 Michael Niedermayer

36 e3905ce0 Diego Biurrun
update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
37
size: 128 bytes. Oops.
38
The semantics of dcbz were changed: it always clears 32 bytes, so the function
39
below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
40
which is defined to clear a cache line (as dcbz before). So we still can
41
distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
42 a4adb608 Michael Niedermayer

43 e3905ce0 Diego Biurrun
see <http://developer.apple.com/technotes/tn/tn2087.html>
44
and <http://developer.apple.com/technotes/tn/tn2086.html>
45 35e5fb06 Romain Dolbeau
*/
46 ddb8c2c0 Måns Rullgård
/**
 * Zero the six 8x8 DCTELEM coefficient blocks starting at blocks, using the
 * 'dcbz' instruction on CPUs where it clears exactly 32 bytes.
 *
 * dcbz zeroes the whole cache line containing the addressed byte, so it may
 * only be issued for lines that lie entirely inside the buffer. When the
 * buffer starts in the middle of a 32-byte line, the first and last 16 bytes
 * are cleared with ordinary stores instead.
 *
 * NOTE(review): this assumes blocks is at least 16-byte aligned (only bit 4
 * of the address is examined) -- confirm against the callers.
 *
 * @param blocks start of the coefficient buffer (sizeof(DCTELEM)*6*64 bytes)
 */
static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
    /* Size of the whole buffer in bytes. */
    const int total = sizeof(DCTELEM)*6*64;
    /* Non-zero when the buffer starts 16 bytes into a 32-byte line. */
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;

    if (misal) {
        /* Leading half line: clear exactly 16 bytes. The size is given in
         * bytes so that it does not depend on sizeof(long), which is 8 on
         * 64-bit PowerPC. */
        memset(blocks, 0, 16);
        i += 16;
    }
    /* Every remaining cache line that is fully contained in the buffer. */
    for ( ; i < total-31 ; i += 32) {
        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        /* Trailing half line: the last 16 bytes of the buffer. */
        memset((char *)blocks + total - 16, 0, 16);
    }
}
72
73 a4adb608 Michael Niedermayer
/* same as above, when dcbzl clears a whole 128B cache line
74
   i.e. the PPC970 aka G5 */
75 b250f9c6 Aurelien Jacobs
#if HAVE_DCBZL
76 ddb8c2c0 Måns Rullgård
static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
77 a4adb608 Michael Niedermayer
{
78
    register int misal = ((unsigned long)blocks & 0x0000007f);
79
    register int i = 0;
80
#if 1
81 e3905ce0 Diego Biurrun
    if (misal) {
82
        // we could probably also optimize this case,
83
        // but there's not much point as the machines
84
        // aren't available yet (2003-06-26)
85
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
86 a4adb608 Michael Niedermayer
    }
87
    else
88 e3905ce0 Diego Biurrun
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
89 be449fca Diego Pettenò
            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
90 e3905ce0 Diego Biurrun
        }
91 a4adb608 Michael Niedermayer
#else
92
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
93
#endif
94
}
95
#else
96 ddb8c2c0 Måns Rullgård
static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
97 a4adb608 Michael Niedermayer
{
98 e3905ce0 Diego Biurrun
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
99 a4adb608 Michael Niedermayer
}
100
#endif
101
102 b250f9c6 Aurelien Jacobs
#if HAVE_DCBZL
/**
 * Measure how many bytes a single 'dcbzl' instruction sets to zero.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, one dcbzl is issued on the
 * byte in the middle of it, and the zero bytes found afterwards are counted.
 *
 * update 24/06/2003 : dcbzl is used instead of dcbz to get the intended
 * effect (Apple "fixed" dcbz); unfortunately this cannot be used unless the
 * assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return number of bytes zeroed by dcbzl, or 0 if the scratch buffer could
 *         not be allocated
 */
static long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so....
       The "memory" clobber tells the compiler that the instruction writes
       to memory, so the bytes stored by memset above must be re-read by
       the counting loop below rather than assumed to still be 0xFF. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
}
#else
/**
 * Variant used when HAVE_DCBZL is not set: reports that dcbzl zeroes 0 bytes,
 * which matches neither of the sizes the caller dispatches on.
 */
static long check_dcbzl_effect(void)
{
    return 0;
}
#endif
143 35e5fb06 Romain Dolbeau
144 a5db5bda Luca Barbato
/**
 * Issue a 'dcbt' (data cache block touch) on the first byte of each of h rows.
 *
 * @param mem    address of the first row
 * @param stride distance in bytes between the starts of consecutive rows
 * @param h      number of rows to touch; nothing is done when h <= 0
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    register const uint8_t *p = mem;

    /* Without this guard a non-positive h would make the pre-decrement loop
     * below run until h wraps around, which is signed-overflow undefined
     * behaviour after roughly 2^32 iterations. */
    if (h <= 0)
        return;

    do {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    } while (--h);
}
152
153 b0368839 Michael Niedermayer
/**
 * Install the PowerPC-specific function pointers into a DSPContext.
 *
 * The prefetch hook and, depending on the value returned by
 * check_dcbzl_effect(), the clear_blocks hook are set whether or not
 * AltiVec is available. When built with HAVE_ALTIVEC and the CPU reports
 * AV_CPU_FLAG_ALTIVEC, the AltiVec init routines are called and the
 * gmc1/fdct/idct hooks are set according to the codec context.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context supplying dct_algo, idct_algo and lowres
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    const long dcbzl_bytes = check_dcbzl_effect();

    // Common optimizations whether AltiVec is available or not
    c->prefetch = prefetch_ppc;

    // Any size other than 32 or 128 leaves c->clear_blocks untouched.
    if (dcbzl_bytes == 32)
        c->clear_blocks = clear_blocks_dcbz32_ppc;
    else if (dcbzl_bytes == 128)
        c->clear_blocks = clear_blocks_dcbz128_ppc;

#if HAVE_ALTIVEC
    if (CONFIG_H264_DECODER)
        dsputil_h264_init_ppc(c, avctx);

    // Everything below is only set up when the CPU reports AltiVec.
    if (!(mm_support() & AV_CPU_FLAG_ALTIVEC))
        return;

    dsputil_init_altivec(c, avctx);
    if (CONFIG_VC1_DECODER)
        vc1dsp_init_altivec(c, avctx);
    float_init_altivec(c, avctx);
    int_init_altivec(c, avctx);
    c->gmc1 = gmc1_altivec;

#if CONFIG_ENCODERS
    if (avctx->dct_algo == FF_DCT_AUTO || avctx->dct_algo == FF_DCT_ALTIVEC)
        c->fdct = fdct_altivec;
#endif //CONFIG_ENCODERS

    // The IDCT hooks are only replaced when lowres is 0.
    if (avctx->lowres != 0)
        return;

    if (avctx->idct_algo == FF_IDCT_AUTO ||
        avctx->idct_algo == FF_IDCT_ALTIVEC) {
        c->idct_put = idct_put_altivec;
        c->idct_add = idct_add_altivec;
        c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
               avctx->idct_algo == FF_IDCT_VP3) {
        c->idct_put = ff_vp3_idct_put_altivec;
        c->idct_add = ff_vp3_idct_add_altivec;
        c->idct     = ff_vp3_idct_altivec;
        c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    }
#endif /* HAVE_ALTIVEC */
}