Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dct-test.c @ 2912e87a

History | View | Annotate | Download (16.4 KB)

1
/*
2
 * (c) 2001 Fabrice Bellard
3
 *     2007 Marc Hoffman <marc.hoffman@analog.com>
4
 *
5
 * This file is part of Libav.
6
 *
7
 * Libav is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * Libav is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with Libav; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
/**
23
 * @file
24
 * DCT test (c) 2001 Fabrice Bellard
25
 * Started from sample code by Juan J. Sierralta P.
26
 */
27

    
28
#include <stdlib.h>
29
#include <stdio.h>
30
#include <string.h>
31
#include <sys/time.h>
32
#include <unistd.h>
33
#include <math.h>
34

    
35
#include "libavutil/cpu.h"
36
#include "libavutil/common.h"
37
#include "libavutil/lfg.h"
38

    
39
#include "simple_idct.h"
40
#include "aandcttab.h"
41
#include "faandct.h"
42
#include "faanidct.h"
43
#include "x86/idct_xvid.h"
44
#include "dctref.h"
45

    
46
#undef printf
47

    
48
void ff_mmx_idct(DCTELEM *data);
49
void ff_mmxext_idct(DCTELEM *data);
50

    
51
void odivx_idct_c(short *block);
52

    
53
// BFIN
54
void ff_bfin_idct(DCTELEM *block);
55
void ff_bfin_fdct(DCTELEM *block);
56

    
57
// ALTIVEC
58
void fdct_altivec(DCTELEM *block);
59
//void idct_altivec(DCTELEM *block);?? no routine
60

    
61
// ARM
62
void ff_j_rev_dct_arm(DCTELEM *data);
63
void ff_simple_idct_arm(DCTELEM *data);
64
void ff_simple_idct_armv5te(DCTELEM *data);
65
void ff_simple_idct_armv6(DCTELEM *data);
66
void ff_simple_idct_neon(DCTELEM *data);
67

    
68
void ff_simple_idct_axp(DCTELEM *data);
69

    
70
struct algo {
71
  const char *name;
72
  enum { FDCT, IDCT } is_idct;
73
  void (* func) (DCTELEM *block);
74
  void (* ref)  (DCTELEM *block);
75
  enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
76
  int  mm_support;
77
};
78

    
79
#ifndef FAAN_POSTSCALE
80
#define FAAN_SCALE SCALE_PERM
81
#else
82
#define FAAN_SCALE NO_PERM
83
#endif
84

    
85
static int cpu_flags;
86

    
87
struct algo algos[] = {
88
  {"REF-DBL",         0, ff_ref_fdct,        ff_ref_fdct, NO_PERM},
89
  {"FAAN",            0, ff_faandct,         ff_ref_fdct, FAAN_SCALE},
90
  {"FAANI",           1, ff_faanidct,        ff_ref_idct, NO_PERM},
91
  {"IJG-AAN-INT",     0, fdct_ifast,         ff_ref_fdct, SCALE_PERM},
92
  {"IJG-LLM-INT",     0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
93
  {"REF-DBL",         1, ff_ref_idct,        ff_ref_idct, NO_PERM},
94
  {"INT",             1, j_rev_dct,          ff_ref_idct, MMX_PERM},
95
  {"SIMPLE-C",        1, ff_simple_idct,     ff_ref_idct, NO_PERM},
96

    
97
#if HAVE_MMX
98
  {"MMX",             0, ff_fdct_mmx,        ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
99
#if HAVE_MMX2
100
  {"MMX2",            0, ff_fdct_mmx2,       ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
101
  {"SSE2",            0, ff_fdct_sse2,       ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
102
#endif
103

    
104
#if CONFIG_GPL
105
  {"LIBMPEG2-MMX",    1, ff_mmx_idct,        ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
106
  {"LIBMPEG2-MMX2",   1, ff_mmxext_idct,     ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
107
#endif
108
  {"SIMPLE-MMX",      1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
109
  {"XVID-MMX",        1, ff_idct_xvid_mmx,   ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
110
  {"XVID-MMX2",       1, ff_idct_xvid_mmx2,  ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
111
  {"XVID-SSE2",       1, ff_idct_xvid_sse2,  ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
112
#endif
113

    
114
#if HAVE_ALTIVEC
115
  {"altivecfdct",     0, fdct_altivec,       ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
116
#endif
117

    
118
#if ARCH_BFIN
119
  {"BFINfdct",        0, ff_bfin_fdct,       ff_ref_fdct, NO_PERM},
120
  {"BFINidct",        1, ff_bfin_idct,       ff_ref_idct, NO_PERM},
121
#endif
122

    
123
#if ARCH_ARM
124
  {"SIMPLE-ARM",      1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
125
  {"INT-ARM",         1, ff_j_rev_dct_arm,   ff_ref_idct, MMX_PERM },
126
#if HAVE_ARMV5TE
127
  {"SIMPLE-ARMV5TE",  1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
128
#endif
129
#if HAVE_ARMV6
130
  {"SIMPLE-ARMV6",    1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
131
#endif
132
#if HAVE_NEON
133
  {"SIMPLE-NEON",     1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
134
#endif
135
#endif /* ARCH_ARM */
136

    
137
#if ARCH_ALPHA
138
  {"SIMPLE-ALPHA",    1, ff_simple_idct_axp,  ff_ref_idct, NO_PERM },
139
#endif
140

    
141
  { 0 }
142
};
143

    
144
#define AANSCALE_BITS 12
145

    
146
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
147

    
148
static int64_t gettime(void)
149
{
150
    struct timeval tv;
151
    gettimeofday(&tv,NULL);
152
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
153
}
154

    
155
#define NB_ITS 20000
156
#define NB_ITS_SPEED 50000
157

    
158
static short idct_mmx_perm[64];
159

    
160
static short idct_simple_mmx_perm[64]={
161
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
162
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
163
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
164
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
165
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
166
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
167
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
168
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
169
};
170

    
171
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
172

    
173
static void idct_mmx_init(void)
174
{
175
    int i;
176

    
177
    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
178
    for (i = 0; i < 64; i++) {
179
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
180
//        idct_simple_mmx_perm[i] = simple_block_permute_op(i);
181
    }
182
}
183

    
184
DECLARE_ALIGNED(16, static DCTELEM, block)[64];
185
DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
186
DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
187

    
188
static inline void mmx_emms(void)
189
{
190
#if HAVE_MMX
191
    if (cpu_flags & AV_CPU_FLAG_MMX)
192
        __asm__ volatile ("emms\n\t");
193
#endif
194
}
195

    
196
static void dct_error(const char *name, int is_idct,
197
               void (*fdct_func)(DCTELEM *block),
198
               void (*fdct_ref)(DCTELEM *block), int form, int test)
199
{
200
    int it, i, scale;
201
    int err_inf, v;
202
    int64_t err2, ti, ti1, it1;
203
    int64_t sysErr[64], sysErrMax=0;
204
    int maxout=0;
205
    int blockSumErrMax=0, blockSumErr;
206
    AVLFG prng;
207

    
208
    av_lfg_init(&prng, 1);
209

    
210
    err_inf = 0;
211
    err2 = 0;
212
    for(i=0; i<64; i++) sysErr[i]=0;
213
    for(it=0;it<NB_ITS;it++) {
214
        for(i=0;i<64;i++)
215
            block1[i] = 0;
216
        switch(test){
217
        case 0:
218
            for(i=0;i<64;i++)
219
                block1[i] = (av_lfg_get(&prng) % 512) -256;
220
            if (is_idct){
221
                ff_ref_fdct(block1);
222

    
223
                for(i=0;i<64;i++)
224
                    block1[i]>>=3;
225
            }
226
        break;
227
        case 1:{
228
            int num = av_lfg_get(&prng) % 10 + 1;
229
            for(i=0;i<num;i++)
230
                block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
231
        }break;
232
        case 2:
233
            block1[0] = av_lfg_get(&prng) % 4096 - 2048;
234
            block1[63]= (block1[0]&1)^1;
235
        break;
236
        }
237

    
238
#if 0 // simulate mismatch control
239
{ int sum=0;
240
        for(i=0;i<64;i++)
241
           sum+=block1[i];
242

243
        if((sum&1)==0) block1[63]^=1;
244
}
245
#endif
246

    
247
        for(i=0; i<64; i++)
248
            block_org[i]= block1[i];
249

    
250
        if (form == MMX_PERM) {
251
            for(i=0;i<64;i++)
252
                block[idct_mmx_perm[i]] = block1[i];
253
            } else if (form == MMX_SIMPLE_PERM) {
254
            for(i=0;i<64;i++)
255
                block[idct_simple_mmx_perm[i]] = block1[i];
256

    
257
        } else if (form == SSE2_PERM) {
258
            for(i=0; i<64; i++)
259
                block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
260
        } else if (form == PARTTRANS_PERM) {
261
            for(i=0; i<64; i++)
262
                block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
263
        } else {
264
            for(i=0; i<64; i++)
265
                block[i]= block1[i];
266
        }
267
#if 0 // simulate mismatch control for tested IDCT but not the ref
268
{ int sum=0;
269
        for(i=0;i<64;i++)
270
           sum+=block[i];
271

272
        if((sum&1)==0) block[63]^=1;
273
}
274
#endif
275

    
276
        fdct_func(block);
277
        mmx_emms();
278

    
279
        if (form == SCALE_PERM) {
280
            for(i=0; i<64; i++) {
281
                scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
282
                block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
283
            }
284
        }
285

    
286
        fdct_ref(block1);
287

    
288
        blockSumErr=0;
289
        for(i=0;i<64;i++) {
290
            v = abs(block[i] - block1[i]);
291
            if (v > err_inf)
292
                err_inf = v;
293
            err2 += v * v;
294
            sysErr[i] += block[i] - block1[i];
295
            blockSumErr += v;
296
            if( abs(block[i])>maxout) maxout=abs(block[i]);
297
        }
298
        if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
299
#if 0 // print different matrix pairs
300
        if(blockSumErr){
301
            printf("\n");
302
            for(i=0; i<64; i++){
303
                if((i&7)==0) printf("\n");
304
                printf("%4d ", block_org[i]);
305
            }
306
            for(i=0; i<64; i++){
307
                if((i&7)==0) printf("\n");
308
                printf("%4d ", block[i] - block1[i]);
309
            }
310
        }
311
#endif
312
    }
313
    for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
314

    
315
#if 1 // dump systematic errors
316
    for(i=0; i<64; i++){
317
        if(i%8==0) printf("\n");
318
        printf("%7d ", (int)sysErr[i]);
319
    }
320
    printf("\n");
321
#endif
322

    
323
    printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
324
           is_idct ? "IDCT" : "DCT",
325
           name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
326
#if 1 //Speed test
327
    /* speed test */
328
    for(i=0;i<64;i++)
329
        block1[i] = 0;
330
    switch(test){
331
    case 0:
332
        for(i=0;i<64;i++)
333
            block1[i] = av_lfg_get(&prng) % 512 -256;
334
        if (is_idct){
335
            ff_ref_fdct(block1);
336

    
337
            for(i=0;i<64;i++)
338
                block1[i]>>=3;
339
        }
340
    break;
341
    case 1:{
342
    case 2:
343
        block1[0] = av_lfg_get(&prng) % 512 -256;
344
        block1[1] = av_lfg_get(&prng) % 512 -256;
345
        block1[2] = av_lfg_get(&prng) % 512 -256;
346
        block1[3] = av_lfg_get(&prng) % 512 -256;
347
    }break;
348
    }
349

    
350
    if (form == MMX_PERM) {
351
        for(i=0;i<64;i++)
352
            block[idct_mmx_perm[i]] = block1[i];
353
    } else if(form == MMX_SIMPLE_PERM) {
354
        for(i=0;i<64;i++)
355
            block[idct_simple_mmx_perm[i]] = block1[i];
356
    } else {
357
        for(i=0; i<64; i++)
358
            block[i]= block1[i];
359
    }
360

    
361
    ti = gettime();
362
    it1 = 0;
363
    do {
364
        for(it=0;it<NB_ITS_SPEED;it++) {
365
            for(i=0; i<64; i++)
366
                block[i]= block1[i];
367
//            memcpy(block, block1, sizeof(DCTELEM) * 64);
368
// do not memcpy especially not fastmemcpy because it does movntq !!!
369
            fdct_func(block);
370
        }
371
        it1 += NB_ITS_SPEED;
372
        ti1 = gettime() - ti;
373
    } while (ti1 < 1000000);
374
    mmx_emms();
375

    
376
    printf("%s %s: %0.1f kdct/s\n",
377
           is_idct ? "IDCT" : "DCT",
378
           name, (double)it1 * 1000.0 / (double)ti1);
379
#endif
380
}
381

    
382
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
383
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
384

    
385
static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
386
{
387
    static int init;
388
    static double c8[8][8];
389
    static double c4[4][4];
390
    double block1[64], block2[64], block3[64];
391
    double s, sum, v;
392
    int i, j, k;
393

    
394
    if (!init) {
395
        init = 1;
396

    
397
        for(i=0;i<8;i++) {
398
            sum = 0;
399
            for(j=0;j<8;j++) {
400
                s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
401
                c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
402
                sum += c8[i][j] * c8[i][j];
403
            }
404
        }
405

    
406
        for(i=0;i<4;i++) {
407
            sum = 0;
408
            for(j=0;j<4;j++) {
409
                s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
410
                c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
411
                sum += c4[i][j] * c4[i][j];
412
            }
413
        }
414
    }
415

    
416
    /* butterfly */
417
    s = 0.5 * sqrt(2.0);
418
    for(i=0;i<4;i++) {
419
        for(j=0;j<8;j++) {
420
            block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
421
            block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
422
        }
423
    }
424

    
425
    /* idct8 on lines */
426
    for(i=0;i<8;i++) {
427
        for(j=0;j<8;j++) {
428
            sum = 0;
429
            for(k=0;k<8;k++)
430
                sum += c8[k][j] * block1[8*i+k];
431
            block2[8*i+j] = sum;
432
        }
433
    }
434

    
435
    /* idct4 */
436
    for(i=0;i<8;i++) {
437
        for(j=0;j<4;j++) {
438
            /* top */
439
            sum = 0;
440
            for(k=0;k<4;k++)
441
                sum += c4[k][j] * block2[8*(2*k)+i];
442
            block3[8*(2*j)+i] = sum;
443

    
444
            /* bottom */
445
            sum = 0;
446
            for(k=0;k<4;k++)
447
                sum += c4[k][j] * block2[8*(2*k+1)+i];
448
            block3[8*(2*j+1)+i] = sum;
449
        }
450
    }
451

    
452
    /* clamp and store the result */
453
    for(i=0;i<8;i++) {
454
        for(j=0;j<8;j++) {
455
            v = block3[8*i+j];
456
            if (v < 0)
457
                v = 0;
458
            else if (v > 255)
459
                v = 255;
460
            dest[i * linesize + j] = (int)rint(v);
461
        }
462
    }
463
}
464

    
465
static void idct248_error(const char *name,
466
                    void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
467
{
468
    int it, i, it1, ti, ti1, err_max, v;
469

    
470
    AVLFG prng;
471

    
472
    av_lfg_init(&prng, 1);
473

    
474
    /* just one test to see if code is correct (precision is less
475
       important here) */
476
    err_max = 0;
477
    for(it=0;it<NB_ITS;it++) {
478

    
479
        /* XXX: use forward transform to generate values */
480
        for(i=0;i<64;i++)
481
            block1[i] = av_lfg_get(&prng) % 256 - 128;
482
        block1[0] += 1024;
483

    
484
        for(i=0; i<64; i++)
485
            block[i]= block1[i];
486
        idct248_ref(img_dest1, 8, block);
487

    
488
        for(i=0; i<64; i++)
489
            block[i]= block1[i];
490
        idct248_put(img_dest, 8, block);
491

    
492
        for(i=0;i<64;i++) {
493
            v = abs((int)img_dest[i] - (int)img_dest1[i]);
494
            if (v == 255)
495
                printf("%d %d\n", img_dest[i], img_dest1[i]);
496
            if (v > err_max)
497
                err_max = v;
498
        }
499
#if 0
500
        printf("ref=\n");
501
        for(i=0;i<8;i++) {
502
            int j;
503
            for(j=0;j<8;j++) {
504
                printf(" %3d", img_dest1[i*8+j]);
505
            }
506
            printf("\n");
507
        }
508

509
        printf("out=\n");
510
        for(i=0;i<8;i++) {
511
            int j;
512
            for(j=0;j<8;j++) {
513
                printf(" %3d", img_dest[i*8+j]);
514
            }
515
            printf("\n");
516
        }
517
#endif
518
    }
519
    printf("%s %s: err_inf=%d\n",
520
           1 ? "IDCT248" : "DCT248",
521
           name, err_max);
522

    
523
    ti = gettime();
524
    it1 = 0;
525
    do {
526
        for(it=0;it<NB_ITS_SPEED;it++) {
527
            for(i=0; i<64; i++)
528
                block[i]= block1[i];
529
//            memcpy(block, block1, sizeof(DCTELEM) * 64);
530
// do not memcpy especially not fastmemcpy because it does movntq !!!
531
            idct248_put(img_dest, 8, block);
532
        }
533
        it1 += NB_ITS_SPEED;
534
        ti1 = gettime() - ti;
535
    } while (ti1 < 1000000);
536
    mmx_emms();
537

    
538
    printf("%s %s: %0.1f kdct/s\n",
539
           1 ? "IDCT248" : "DCT248",
540
           name, (double)it1 * 1000.0 / (double)ti1);
541
}
542

    
543
static void help(void)
544
{
545
    printf("dct-test [-i] [<test-number>]\n"
546
           "test-number 0 -> test with random matrixes\n"
547
           "            1 -> test with random sparse matrixes\n"
548
           "            2 -> do 3. test from mpeg4 std\n"
549
           "-i          test IDCT implementations\n"
550
           "-4          test IDCT248 implementations\n");
551
}
552

    
553
int main(int argc, char **argv)
554
{
555
    int test_idct = 0, test_248_dct = 0;
556
    int c,i;
557
    int test=1;
558
    cpu_flags = av_get_cpu_flags();
559

    
560
    ff_ref_dct_init();
561
    idct_mmx_init();
562

    
563
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
564
    for(i=0;i<MAX_NEG_CROP;i++) {
565
        cropTbl[i] = 0;
566
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
567
    }
568

    
569
    for(;;) {
570
        c = getopt(argc, argv, "ih4");
571
        if (c == -1)
572
            break;
573
        switch(c) {
574
        case 'i':
575
            test_idct = 1;
576
            break;
577
        case '4':
578
            test_248_dct = 1;
579
            break;
580
        default :
581
        case 'h':
582
            help();
583
            return 0;
584
        }
585
    }
586

    
587
    if(optind <argc) test= atoi(argv[optind]);
588

    
589
    printf("ffmpeg DCT/IDCT test\n");
590

    
591
    if (test_248_dct) {
592
        idct248_error("SIMPLE-C", ff_simple_idct248_put);
593
    } else {
594
      for (i=0;algos[i].name;i++)
595
        if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
596
          dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
597
        }
598
    }
599
    return 0;
600
}