Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_ppc.c @ 14cabd40

History | View | Annotate | Download (9.43 KB)

1
/*
2
 * Copyright (c) 2002 Brian Foley
3
 * Copyright (c) 2002 Dieter Shirley
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19

    
20
#include "../dsputil.h"
21

    
22
#include "dsputil_ppc.h"
23

    
24
#ifdef HAVE_ALTIVEC
25
#include "dsputil_altivec.h"
26
#endif
27

    
28
/* AltiVec DCT/IDCT routines. They are only declared in this file;
   dsputil_init_ppc() stores their addresses in the DSPContext. */
void fdct_altivec(int16_t *block);
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* Capability bitmask, initially empty. dsputil_init_ppc() ORs MM_ALTIVEC
   into it when has_altivec() reports support. */
int mm_flags = 0;
33

    
34
/**
 * Build the capability bitmask for the current machine.
 *
 * @return MM_ALTIVEC when the file is compiled with HAVE_ALTIVEC and
 *         has_altivec() returns non-zero; 0 in every other case.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    return has_altivec() ? MM_ALTIVEC : 0;
#else
    return 0;
#endif /* HAVE_ALTIVEC */
}
44

    
45
#ifdef POWERPC_PERFORMANCE_REPORT
/* Accumulated counter values, indexed as [pmc][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
/* Plain 'const char *' entries: they are initialised from string literals
   and printed with "%s", so they must be neither 'unsigned char *' nor
   writable. */
static const char *const perfname[] = {
  "fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
};
#include <stdio.h>
#endif
67

    
68
#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Print the contents of perfdata to stderr.
 *
 * For every (function, PMC) pair whose powerpc_data_num entry is non-zero,
 * the min, max, sum/num average and num values are written. Pairs with a
 * zero powerpc_data_num are skipped, which also avoids a division by zero.
 */
void powerpc_display_perf_report(void)
{
  int func, pmc;

  fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");

  for (func = 0; func < powerpc_perf_total; func++) {
    for (pmc = 0; pmc < POWERPC_NUM_PMC_ENABLED; pmc++) {
      const unsigned long long *stats = perfdata[pmc][func];
      double average;

      if (stats[powerpc_data_num] == (unsigned long long)0)
        continue;

      average = (double)stats[powerpc_data_sum] /
                (double)stats[powerpc_data_num];

      /* PMCs are reported 1-based (pmc1, pmc2, ...). */
      fprintf(stderr,
              " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
              perfname[func],
              pmc + 1,
              stats[powerpc_data_min],
              stats[powerpc_data_max],
              average,
              stats[powerpc_data_num]);
    }
  }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
91

    
92
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes, so the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
117
/**
 * Zero sizeof(DCTELEM)*6*64 bytes starting at 'blocks', using one 'dcbz'
 * per 32 bytes.
 *
 * When bit 4 of the address is set (the buffer starts 16 bytes into a
 * 32-byte line) the first and last 16 bytes are cleared with memset and
 * dcbz is issued only for the 32-byte lines that lie entirely inside the
 * buffer. Issuing it for the final partial line would zero 16 bytes
 * beyond the end of the buffer.
 *
 * NOTE(review): only bit 4 of the address is inspected, so the pointer is
 * presumably expected to be at least 16-byte aligned -- confirm against
 * callers.
 *
 * @param blocks start of the coefficient buffer to clear
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    const int total = sizeof(DCTELEM)*6*64;
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
#if 1
    if (misal) {
      /* leading half line; byte-based so it does not depend on
         sizeof(unsigned long) */
      memset(blocks, 0, 16);
      i += 16;
    }
    /* only touch lines that end at or before the end of the buffer */
    for ( ; i + 32 <= total ; i += 32) {
      asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
      /* trailing half line */
      memset((char*)blocks + total - 16, 0, 16);
    }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}
146

    
147
/* same as above, when dcbzl clear a whole 128B cache line
148
   i.e. the PPC970 aka G5 */
149
#ifndef NO_DCBZL
150
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
151
{
152
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
153
    register int misal = ((unsigned long)blocks & 0x0000007f);
154
    register int i = 0;
155
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
156
#if 1
157
 if (misal) {
158
   // we could probably also optimize this case,
159
   // but there's not much point as the machines
160
   // aren't available yet (2003-06-26)
161
      memset(blocks, 0, sizeof(DCTELEM)*6*64);
162
    }
163
    else
164
      for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
165
        asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
166
      }
167
#else
168
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
169
#endif
170
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
171
}
172
#else
173
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
174
{
175
  memset(blocks, 0, sizeof(DCTELEM)*6*64);
176
}
177
#endif
178

    
179
#ifndef NO_DCBZL
180
/* check dcbz report how many bytes are set to 0 by dcbz */
181
/* update 24/06/2003 : replace dcbz by dcbzl to get
182
   the intended effect (Apple "fixed" dcbz)
183
   unfortunately this cannot be used unless the assembler
184
   knows about dcbzl ... */
185
long check_dcbzl_effect(void)
186
{
187
  register char *fakedata = (char*)av_malloc(1024);
188
  register char *fakedata_middle;
189
  register long zero = 0;
190
  register long i = 0;
191
  long count = 0;
192

    
193
  if (!fakedata)
194
  {
195
    return 0L;
196
  }
197

    
198
  fakedata_middle = (fakedata + 512);
199

    
200
  memset(fakedata, 0xFF, 1024);
201

    
202
  /* below the constraint "b" seems to mean "Address base register"
203
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
204
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
205

    
206
  for (i = 0; i < 1024 ; i ++)
207
  {
208
    if (fakedata[i] == (char)0)
209
      count++;
210
  }
211

    
212
  av_free(fakedata);
213
  
214
  return count;
215
}
216
#else
217
long check_dcbzl_effect(void)
218
{
219
  return 0;
220
}
221
#endif
222

    
223
/**
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * - Resets the performance counters when POWERPC_PERFORMANCE_REPORT is
 *   defined. This is done unconditionally because the dcbz clear_blocks
 *   routines are instrumented whether or not AltiVec is present.
 * - Selects a dcbz-based clear_blocks according to check_dcbzl_effect()
 *   (32 or 128 bytes); any other value leaves c->clear_blocks untouched.
 * - When built with HAVE_ALTIVEC and has_altivec() is true, sets
 *   MM_ALTIVEC in mm_flags and installs the AltiVec implementations.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context; its dct_algo / idct_algo fields choose
 *              whether the AltiVec (I)DCT is installed
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
#ifdef POWERPC_PERFORMANCE_REPORT
    {
      int i, j;
      for (i = 0 ; i < powerpc_perf_total ; i++)
      {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
        {
          /* min starts at the largest value so any sample lowers it */
          perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
          perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
          perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
          perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
        }
      }
    }
#endif /* POWERPC_PERFORMANCE_REPORT */

    // Common optimizations whether Altivec is available or not

    switch (check_dcbzl_effect()) {
    case 32:
        c->clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:
        c->clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:
        /* unknown or unusable line size: keep the existing clear_blocks */
        break;
    }

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        // Altivec specific optimisations
        c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
        c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
        c->pix_abs16x16 = pix_abs16x16_altivec;
        c->pix_abs8x8 = pix_abs8x8_altivec;
        c->sad[0]= sad16x16_altivec;
        c->sad[1]= sad8x8_altivec;
        c->pix_norm1 = pix_norm1_altivec;
        c->sse[1]= sse8_altivec;
        c->sse[0]= sse16_altivec;
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
// next one disabled as it's untested.
#if 0
        c->add_bytes= add_bytes_altivec;
#endif /* 0 */
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        /* the two functions do the same thing, so use the same code */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
// next one disabled as it's untested.
#if 0
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
#endif /* 0 */
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

        c->gmc1 = gmc1_altivec;

#ifdef CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC)
        {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC))
        {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
            c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        }
    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}