Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_ppc.c @ 58c2182d

History | View | Annotate | Download (9.47 KB)

1 05c4072b Michael Niedermayer
/*
2
 * Copyright (c) 2002 Brian Foley
3
 * Copyright (c) 2002 Dieter Shirley
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19
20 ab6c65f6 Brian Foley
#include "../dsputil.h"
21
22 35e5fb06 Romain Dolbeau
#include "dsputil_ppc.h"
23
24 ab6c65f6 Brian Foley
#ifdef HAVE_ALTIVEC
25
#include "dsputil_altivec.h"
26
#endif
27
28 14cabd40 James Klicman
/* AltiVec DCT routines; prototypes only, installed by dsputil_init_ppc(). */
void fdct_altivec(int16_t *block);
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* Global CPU-feature bitmask; dsputil_init_ppc() ORs MM_ALTIVEC into it. */
int mm_flags = 0;
33
34 e629ab68 Romain Dolbeau
/*
 * Report the SIMD capabilities detected at run time.
 *
 * Returns MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 * has_altivec() reports true; returns 0 otherwise.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        return MM_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return 0;
}
44
45 e45a2872 Romain Dolbeau
#ifdef POWERPC_PERFORMANCE_REPORT
#include <stdio.h>

/* Performance counter storage, indexed as [pmc][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* list below must match enum in dsputil_ppc.h */
/* Plain (const) char pointers: the entries are string literals and are
   printed with "%s", so "unsigned char *" was a signedness mismatch. */
static const char *const perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
};
#endif /* POWERPC_PERFORMANCE_REPORT */
67
68 e45a2872 Romain Dolbeau
#ifdef POWERPC_PERFORMANCE_REPORT
/*
 * Log the collected PMC statistics through av_log().
 *
 * For every (function, pmc) pair whose sample count is non-zero, prints
 * the minimum, maximum, average (sum / count) and the sample count.
 */
void powerpc_display_perf_report(void)
{
  int func, pmc;

  av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");

  for (func = 0; func < powerpc_perf_total; func++) {
    for (pmc = 0; pmc < POWERPC_NUM_PMC_ENABLED; pmc++) {
      const unsigned long long *stats = perfdata[pmc][func];

      /* Skip counters that never recorded a sample (also avoids 0/0). */
      if (stats[powerpc_data_num] == (unsigned long long)0)
        continue;

      av_log(NULL, AV_LOG_INFO,
             " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
             perfname[func],
             pmc + 1,
             stats[powerpc_data_min],
             stats[powerpc_data_max],
             (double)stats[powerpc_data_sum] / (double)stats[powerpc_data_num],
             stats[powerpc_data_num]);
    }
  }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
91 35e5fb06 Romain Dolbeau
92
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  Update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it now always clears
  32 bytes. So the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
117
/*
 * Zero six 64-coefficient DCT blocks (sizeof(DCTELEM)*6*64 bytes) using
 * the 'dcbz' instruction, relying on it clearing exactly 32 bytes.
 *
 * blocks: start of the coefficient array.  Only bit 4 of the address is
 *         examined, so the pointer is assumed to be at least 16-byte
 *         aligned -- TODO confirm against callers.
 *
 * When the pointer is 16- but not 32-byte aligned, the first and last
 * 16 bytes are cleared with ordinary stores and dcbz is used only on the
 * 32-byte lines that lie entirely inside the buffer.
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    const int total = sizeof(DCTELEM)*6*64;
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
#if 1
    if (misal) {
      /* Head: 16 bytes up to the first 32-byte boundary.  memset keeps
         this independent of sizeof(unsigned long). */
      memset(blocks, 0, 16);
      i += 16;
    }
    /* Only issue dcbz when the whole 32-byte line fits in the buffer;
       in the misaligned case the last line would otherwise zero 16
       bytes past the end. */
    for ( ; i + 32 <= total ; i += 32) {
#ifndef __MWERKS__
      asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
#else
      __dcbz( blocks, i );
#endif
    }
    if (misal) {
      /* Tail: the final 16 bytes not covered by a full line. */
      memset((char *)blocks + total - 16, 0, 16);
    }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}
150
151 a4adb608 Michael Niedermayer
/* same as above, when dcbzl clear a whole 128B cache line
152
   i.e. the PPC970 aka G5 */
153
#ifndef NO_DCBZL
154
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
155
{
156 e45a2872 Romain Dolbeau
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
157 a4adb608 Michael Niedermayer
    register int misal = ((unsigned long)blocks & 0x0000007f);
158
    register int i = 0;
159 e45a2872 Romain Dolbeau
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
160 a4adb608 Michael Niedermayer
#if 1
161
 if (misal) {
162
   // we could probably also optimize this case,
163
   // but there's not much point as the machines
164
   // aren't available yet (2003-06-26)
165
      memset(blocks, 0, sizeof(DCTELEM)*6*64);
166
    }
167
    else
168
      for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
169 3efd4952 Romain Dolbeau
        asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
170 a4adb608 Michael Niedermayer
      }
171
#else
172
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
173
#endif
174 e45a2872 Romain Dolbeau
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
175 a4adb608 Michael Niedermayer
}
176
#else
177
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
178
{
179
  memset(blocks, 0, sizeof(DCTELEM)*6*64);
180
}
181
#endif
182
183
#ifndef NO_DCBZL
184 35e5fb06 Romain Dolbeau
/* check dcbz report how many bytes are set to 0 by dcbz */
185 a4adb608 Michael Niedermayer
/* update 24/06/2003 : replace dcbz by dcbzl to get
186
   the intended effect (Apple "fixed" dcbz)
187
   unfortunately this cannot be used unless the assembler
188
   knows about dcbzl ... */
189
long check_dcbzl_effect(void)
190 35e5fb06 Romain Dolbeau
{
191 3b991c54 Romain Dolbeau
  register char *fakedata = (char*)av_malloc(1024);
192 35e5fb06 Romain Dolbeau
  register char *fakedata_middle;
193
  register long zero = 0;
194
  register long i = 0;
195
  long count = 0;
196
197 3b991c54 Romain Dolbeau
  if (!fakedata)
198 35e5fb06 Romain Dolbeau
  {
199
    return 0L;
200
  }
201
202
  fakedata_middle = (fakedata + 512);
203
204
  memset(fakedata, 0xFF, 1024);
205
206 3efd4952 Romain Dolbeau
  /* below the constraint "b" seems to mean "Address base register"
207
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
208
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
209 35e5fb06 Romain Dolbeau
210
  for (i = 0; i < 1024 ; i ++)
211
  {
212
    if (fakedata[i] == (char)0)
213
      count++;
214
  }
215
216 3b991c54 Romain Dolbeau
  av_free(fakedata);
217 35e5fb06 Romain Dolbeau
  
218
  return count;
219
}
220 a4adb608 Michael Niedermayer
#else
221
long check_dcbzl_effect(void)
222
{
223
  return 0;
224
}
225
#endif
226 35e5fb06 Romain Dolbeau
227 b0368839 Michael Niedermayer
/*
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * c:     context whose function pointers are overridden.
 * avctx: codec context; its dct_algo / idct_algo fields select whether
 *        the AltiVec (I)DCT routines are installed.
 *
 * The clear_blocks choice is made for every PowerPC CPU; everything else
 * is installed only when built with HAVE_ALTIVEC and has_altivec() is true.
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    /* Common optimisations, whether AltiVec is available or not:
       pick clear_blocks from the number of bytes dcbzl clears. */
    const long cleared = check_dcbzl_effect();

    if (cleared == 32) {
        c->clear_blocks = clear_blocks_dcbz32_ppc;
    } else if (cleared == 128) {
        c->clear_blocks = clear_blocks_dcbz128_ppc;
    }
    /* any other value: leave c->clear_blocks untouched */

#ifdef HAVE_ALTIVEC
    if (!has_altivec()) {
        /* Non-AltiVec PPC optimisations: ... pending ... */
        return;
    }

    mm_flags |= MM_ALTIVEC;

    /* AltiVec specific optimisations */

    /* sum of absolute differences */
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]        = sad16_altivec;
    c->sad[1]        = sad8_altivec;

    /* sum of squared errors */
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    /* pixel statistics and block load helpers */
    c->pix_norm1   = pix_norm1_altivec;
    c->pix_sum     = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels  = get_pixels_altivec;
/* next one disabled as it's untested. */
#if 0
    c->add_bytes = add_bytes_altivec;
#endif /* 0 */

    /* full-pel put/avg */
    c->put_pixels_tab[0][0]        = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0]        = avg_pixels16_altivec;
/* next one disabled as it's untested. */
#if 0
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
#endif /* 0 */

    /* xy2 put */
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;

    c->gmc1 = gmc1_altivec;

#ifdef CONFIG_ENCODERS
    switch (avctx->dct_algo) {
    case FF_DCT_AUTO:
    case FF_DCT_ALTIVEC:
        c->fdct = fdct_altivec;
        break;
    default:
        break;
    }
#endif /* CONFIG_ENCODERS */

    switch (avctx->idct_algo) {
    case FF_IDCT_AUTO:
    case FF_IDCT_ALTIVEC:
        c->idct_put = idct_put_altivec;
        c->idct_add = idct_add_altivec;
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
        c->idct_permutation_type = FF_NO_IDCT_PERM;
#else /* !ALTIVEC_USE_REFERENCE_C_CODE */
        c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        break;
    default:
        break;
    }

#ifdef POWERPC_PERFORMANCE_REPORT
    {
        /* Reset every counter: min starts at the largest value,
           max / sum / num start at zero. */
        int func, pmc;

        for (func = 0; func < powerpc_perf_total; func++) {
            for (pmc = 0; pmc < POWERPC_NUM_PMC_ENABLED; pmc++) {
                unsigned long long *stats = perfdata[pmc][func];

                stats[powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                stats[powerpc_data_max] = 0ULL;
                stats[powerpc_data_sum] = 0ULL;
                stats[powerpc_data_num] = 0ULL;
            }
        }
    }
#endif /* POWERPC_PERFORMANCE_REPORT */
#endif /* HAVE_ALTIVEC */
}