Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_ppc.c @ 58c2182d

History | View | Annotate | Download (9.47 KB)

1
/*
2
 * Copyright (c) 2002 Brian Foley
3
 * Copyright (c) 2002 Dieter Shirley
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19

    
20
#include "../dsputil.h"
21

    
22
#include "dsputil_ppc.h"
23

    
24
#ifdef HAVE_ALTIVEC
25
#include "dsputil_altivec.h"
26
#endif
27

    
28
/* AltiVec DCT / IDCT entry points; they are only declared here and are
   defined outside this file. */
extern void fdct_altivec(int16_t *block);
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* SIMD capability flags; dsputil_init_ppc() ORs MM_ALTIVEC into it when
   AltiVec is detected. */
int mm_flags = 0;
33

    
34
/**
 * Report which SIMD extensions can be used.
 *
 * @return a bitmask of MM_* flags: MM_ALTIVEC when the build defines
 *         HAVE_ALTIVEC and has_altivec() returns non-zero, 0 otherwise.
 */
int mm_support(void)
{
    int result = 0;
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        result |= MM_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return result;
}
44

    
45
#ifdef POWERPC_PERFORMANCE_REPORT
#include <stdio.h>

/* Accumulated counter statistics, indexed [pmc][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* Printable names of the profiled functions, indexed by function id.
   The list below must match the enum in dsputil_ppc.h */
static const char *const perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
};
#endif /* POWERPC_PERFORMANCE_REPORT */
67

    
68
#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Log the collected performance counters through av_log() at
 * AV_LOG_INFO level.
 *
 * For every profiled function and every enabled PMC, the minimum,
 * maximum, average (sum / count) and sample count are printed.
 * Entries whose sample count is zero are skipped, which also keeps the
 * average from dividing by zero.
 */
void powerpc_display_perf_report(void)
{
  int i, j;
  av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
  for (i = 0; i < powerpc_perf_total; i++) {
    for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
      /* statistics row of function i on counter j */
      const unsigned long long *stats = perfdata[j][i];

      if (stats[powerpc_data_num] != (unsigned long long)0) {
        av_log(NULL, AV_LOG_INFO,
               " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
               perfname[i],
               j+1,
               stats[powerpc_data_min],
               stats[powerpc_data_max],
               (double)stats[powerpc_data_sum] /
               (double)stats[powerpc_data_num],
               stats[powerpc_data_num]);
      }
    }
  }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
91

    
92
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately, all processors used by Apple up to
  at least the 7450 (aka second-generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  Update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes, so the function below will work, but will
  be slow. So I fixed check_dcbz_effect to use dcbzl,
  which is defined to clear a whole cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  See <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
117
/**
 * Zero sizeof(DCTELEM)*6*64 bytes starting at 'blocks' with the 'dcbz'
 * instruction, on the assumption that one dcbz clears one 32-byte line.
 *
 * Only bit 4 of the address is examined, so the buffer is presumably
 * expected to be at least 16-byte aligned -- TODO confirm with callers.
 * When it is 16- but not 32-byte aligned, the first and the last
 * 16 bytes are cleared with plain stores and dcbz handles the 32-byte
 * lines in between.
 *
 * @param blocks buffer to clear
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
    const int total = (int)(sizeof(DCTELEM)*6*64);
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
#if 1
    if (misal) {
      /* leading half line; a byte count is used instead of 'unsigned long'
         stores so the size does not depend on sizeof(unsigned long) */
      memset(blocks, 0, 16);
      i += 16;
    }
    /* Only issue dcbz while a whole 32-byte line still fits inside the
       buffer: in the misaligned case a dcbz at the final offset would
       also zero the 16 bytes that follow the buffer. */
    for ( ; i + 32 <= total ; i += 32) {
#ifndef __MWERKS__
      asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
#else
      __dcbz( blocks, i );
#endif
    }
    if (misal) {
      /* trailing half line not covered by the loop above */
      memset((char *)blocks + total - 16, 0, 16);
    }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}
150

    
151
/* same as above, when dcbzl clear a whole 128B cache line
152
   i.e. the PPC970 aka G5 */
153
#ifndef NO_DCBZL
154
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
155
{
156
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
157
    register int misal = ((unsigned long)blocks & 0x0000007f);
158
    register int i = 0;
159
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
160
#if 1
161
 if (misal) {
162
   // we could probably also optimize this case,
163
   // but there's not much point as the machines
164
   // aren't available yet (2003-06-26)
165
      memset(blocks, 0, sizeof(DCTELEM)*6*64);
166
    }
167
    else
168
      for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
169
        asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
170
      }
171
#else
172
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
173
#endif
174
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
175
}
176
#else
177
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
178
{
179
  memset(blocks, 0, sizeof(DCTELEM)*6*64);
180
}
181
#endif
182

    
183
#ifndef NO_DCBZL
184
/* check dcbz report how many bytes are set to 0 by dcbz */
185
/* update 24/06/2003 : replace dcbz by dcbzl to get
186
   the intended effect (Apple "fixed" dcbz)
187
   unfortunately this cannot be used unless the assembler
188
   knows about dcbzl ... */
189
long check_dcbzl_effect(void)
190
{
191
  register char *fakedata = (char*)av_malloc(1024);
192
  register char *fakedata_middle;
193
  register long zero = 0;
194
  register long i = 0;
195
  long count = 0;
196

    
197
  if (!fakedata)
198
  {
199
    return 0L;
200
  }
201

    
202
  fakedata_middle = (fakedata + 512);
203

    
204
  memset(fakedata, 0xFF, 1024);
205

    
206
  /* below the constraint "b" seems to mean "Address base register"
207
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
208
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
209

    
210
  for (i = 0; i < 1024 ; i ++)
211
  {
212
    if (fakedata[i] == (char)0)
213
      count++;
214
  }
215

    
216
  av_free(fakedata);
217
  
218
  return count;
219
}
220
#else
221
long check_dcbzl_effect(void)
222
{
223
  return 0;
224
}
225
#endif
226

    
227
/**
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * - clear_blocks is replaced according to the value returned by
 *   check_dcbzl_effect() (32 or 128); any other value leaves the
 *   existing pointer untouched.
 * - When built with HAVE_ALTIVEC and has_altivec() succeeds, MM_ALTIVEC
 *   is ORed into mm_flags and the AltiVec implementations are installed.
 *   The forward DCT (CONFIG_ENCODERS only) and the IDCT are replaced
 *   only if the corresponding avctx algorithm selector is AUTO or
 *   ALTIVEC.
 * - With POWERPC_PERFORMANCE_REPORT the perfdata table is reset.
 *   NOTE(review): the reset sits inside the AltiVec branch although the
 *   clear_blocks_dcbz* routines are profiled too -- confirm whether that
 *   is intended.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context providing dct_algo / idct_algo
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether Altivec is available or not

    switch (check_dcbzl_effect()) {
    case 32:
        c->clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:
        c->clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:
        break;
    }

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        // Altivec specific optimisations
        c->pix_abs[0][1] = sad16_x2_altivec;
        c->pix_abs[0][2] = sad16_y2_altivec;
        c->pix_abs[0][3] = sad16_xy2_altivec;
        c->pix_abs[0][0] = sad16_altivec;
        c->pix_abs[1][0] = sad8_altivec;
        c->sad[0]= sad16_altivec;
        c->sad[1]= sad8_altivec;
        c->pix_norm1 = pix_norm1_altivec;
        c->sse[1]= sse8_altivec;
        c->sse[0]= sse16_altivec;
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
// next one disabled as it's untested.
#if 0
        c->add_bytes= add_bytes_altivec;
#endif /* 0 */
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        /* the two functions do the same thing, so use the same code */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
// next one disabled as it's untested.
#if 0
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
#endif /* 0 */
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

        c->gmc1 = gmc1_altivec;

#ifdef CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC)
        {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC))
        {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
            c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        }

#ifdef POWERPC_PERFORMANCE_REPORT
        {
            int i, j;
            for (i = 0 ; i < powerpc_perf_total ; i++)
            {
                for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
                {
                    /* min starts at the largest value so that the first
                       sample always replaces it; the ULL suffix keeps
                       the constants 64 bits wide on every compiler */
                    perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                    perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
                }
            }
        }
#endif /* POWERPC_PERFORMANCE_REPORT */
    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}