Statistics
| Branch: | Revision:

ffmpeg / libavcodec / jrevdct.c @ 08842d13

History | View | Annotate | Download (41.1 KB)

1 de6d9b64 Fabrice Bellard
/*
2
 * jrevdct.c
3
 *
4
 * Copyright (C) 1991, 1992, Thomas G. Lane.
5
 * This file is part of the Independent JPEG Group's software.
6
 * For conditions of distribution and use, see the accompanying README file.
7
 *
8
 * This file contains the basic inverse-DCT transformation subroutine.
9
 *
10
 * This implementation is based on an algorithm described in
11
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
12
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
13
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
14
 * The primary algorithm described there uses 11 multiplies and 29 adds.
15
 * We use their alternate method with 12 multiplies and 32 adds.
16
 * The advantage of this method is that no data path contains more than one
17
 * multiplication; this allows a very simple and accurate implementation in
18
 * scaled fixed-point arithmetic, with a minimal number of shifts.
19 115329f1 Diego Biurrun
 *
20 de6d9b64 Fabrice Bellard
 * I've made lots of modifications to attempt to take advantage of the
21
 * sparse nature of the DCT matrices we're getting.  Although the logic
22
 * is cumbersome, it's straightforward and the resulting code is much
23
 * faster.
24
 *
25
 * A better way to do this would be to pass in the DCT block as a sparse
26
 * matrix, perhaps with the different cases encoded.
27
 */
28 115329f1 Diego Biurrun
29 983e3246 Michael Niedermayer
/**
30
 * @file jrevdct.c
31
 * Independent JPEG Group's LLM idct.
32
 */
33 115329f1 Diego Biurrun
34 de6d9b64 Fabrice Bellard
#include "common.h"
35
#include "dsputil.h"
36
37
#define EIGHT_BIT_SAMPLES
38
39
#define DCTSIZE 8
40
#define DCTSIZE2 64
41
42
#define GLOBAL
43
44
/* Shift x right by n bits using the plain `>>` operator.  Right-shifting a
 * negative signed value is implementation-defined in C; DESCALE assumes the
 * result rounds towards minus infinity (i.e. an arithmetic shift). */
#define RIGHT_SHIFT(x, n) ((x) >> (n))
45
46
/* One coefficient block: DCTSIZE2 (64) DCTELEM values, read and overwritten
 * in place by j_rev_dct().  NOTE(review): DCTELEM is declared outside this
 * file -- presumably in dsputil.h; confirm its width there. */
typedef DCTELEM DCTBLOCK[DCTSIZE2];
47
48
#define CONST_BITS 13
49
50
/*
51
 * This routine is specialized to the case DCTSIZE = 8.
52
 */
53
54
#if DCTSIZE != 8
55
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
56
#endif
57
58
59
/*
60
 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
61
 * on each column.  Direct algorithms are also available, but they are
62
 * much more complex and seem not to be any faster when reduced to code.
63
 *
64
 * The details of the scaling are as follows:
65
 *
66
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
67
 * larger than the true IDCT outputs.  The final outputs are therefore
68
 * a factor of N larger than desired; since N=8 this can be cured by
69
 * a simple right shift at the end of the algorithm.  The advantage of
70
 * this arrangement is that we save two multiplications per 1-D IDCT,
71
 * because the y0 and y4 inputs need not be divided by sqrt(N).
72
 *
73
 * We have to do addition and subtraction of the integer inputs, which
74
 * is no problem, and multiplication by fractional constants, which is
75
 * a problem to do in integer arithmetic.  We multiply all the constants
76
 * by CONST_SCALE and convert them to integer constants (thus retaining
77
 * CONST_BITS bits of precision in the constants).  After doing a
78
 * multiplication we have to divide the product by CONST_SCALE, with proper
79
 * rounding, to produce the correct output.  This division can be done
80
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
81
 * as long as possible so that partial sums can be added together with
82
 * full fractional precision.
83
 *
84
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
85
 * they are represented to better-than-integral precision.  These outputs
86
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
88
 * intermediate int32 array would be needed.)
89
 *
90
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
91
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
92
 * shows that the values given below are the most effective.
93
 */
94
95
#ifdef EIGHT_BIT_SAMPLES
96
#define PASS1_BITS  2
97
#else
98 bb270c08 Diego Biurrun
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
99 de6d9b64 Fabrice Bellard
#endif
100
101 bb270c08 Diego Biurrun
#define ONE         ((int32_t) 1)
102 de6d9b64 Fabrice Bellard
103
#define CONST_SCALE (ONE << CONST_BITS)
104
105
/* Convert a positive real constant to an integer scaled by CONST_SCALE.
106
 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
107
 * you will pay a significant penalty in run time.  In that case, figure
108
 * the correct integer constant values and insert them by hand.
109
 */
110
111
/* FIX is no longer used: all of the constants below have been precomputed. */
112 bb270c08 Diego Biurrun
#define FIX(x)  ((int32_t) ((x) * CONST_SCALE + 0.5))
113 de6d9b64 Fabrice Bellard
114 0c1a9eda Zdenek Kabelac
/* Descale and correctly round an int32_t value that's scaled by N bits.
115 de6d9b64 Fabrice Bellard
 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
116
 * the fudge factor is correct for either sign of X.
117
 */
118
119
#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
120
121 0c1a9eda Zdenek Kabelac
/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
122 de6d9b64 Fabrice Bellard
 * For 8-bit samples with the recommended scaling, all the variable
123
 * and constant values involved are no more than 16 bits wide, so a
124
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
125
 * this provides a useful speedup on many machines.
126
 * There is no way to specify a 16x16->32 multiply in portable C, but
127
 * some C compilers will do the right thing if you provide the correct
128
 * combination of casts.
129
 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
130
 */
131
132
#ifdef EIGHT_BIT_SAMPLES
133 bb270c08 Diego Biurrun
#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
134 0c1a9eda Zdenek Kabelac
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int16_t) (const)))
135 de6d9b64 Fabrice Bellard
#endif
136 bb270c08 Diego Biurrun
#ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
137 0c1a9eda Zdenek Kabelac
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int32_t) (const)))
138 de6d9b64 Fabrice Bellard
#endif
139
#endif
140
141 bb270c08 Diego Biurrun
#ifndef MULTIPLY                /* default definition */
142 de6d9b64 Fabrice Bellard
#define MULTIPLY(var,const)  ((var) * (const))
143
#endif
144
145
146 115329f1 Diego Biurrun
/*
147 de6d9b64 Fabrice Bellard
  Unlike our decoder where we approximate the FIXes, we need to use exact
148 115329f1 Diego Biurrun
ones here, or successive P-frames will drift too much with reference-frame coding.
149 de6d9b64 Fabrice Bellard
*/
150
#define FIX_0_211164243 1730
151
#define FIX_0_275899380 2260
152
#define FIX_0_298631336 2446
153
#define FIX_0_390180644 3196
154
#define FIX_0_509795579 4176
155
#define FIX_0_541196100 4433
156
#define FIX_0_601344887 4926
157
#define FIX_0_765366865 6270
158
#define FIX_0_785694958 6436
159
#define FIX_0_899976223 7373
160
#define FIX_1_061594337 8697
161
#define FIX_1_111140466 9102
162
#define FIX_1_175875602 9633
163
#define FIX_1_306562965 10703
164
#define FIX_1_387039845 11363
165
#define FIX_1_451774981 11893
166
#define FIX_1_501321110 12299
167
#define FIX_1_662939225 13623
168
#define FIX_1_847759065 15137
169
#define FIX_1_961570560 16069
170
#define FIX_2_053119869 16819
171
#define FIX_2_172734803 17799
172
#define FIX_2_562915447 20995
173
#define FIX_3_072711026 25172
174
175
/*
176
 * Perform the inverse DCT on one block of coefficients.
177
 */
178
179
void j_rev_dct(DCTBLOCK data)
180
{
181 0c1a9eda Zdenek Kabelac
  int32_t tmp0, tmp1, tmp2, tmp3;
182
  int32_t tmp10, tmp11, tmp12, tmp13;
183
  int32_t z1, z2, z3, z4, z5;
184
  int32_t d0, d1, d2, d3, d4, d5, d6, d7;
185 de6d9b64 Fabrice Bellard
  register DCTELEM *dataptr;
186
  int rowctr;
187 115329f1 Diego Biurrun
188 de6d9b64 Fabrice Bellard
  /* Pass 1: process rows. */
189
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
190
  /* furthermore, we scale the results by 2**PASS1_BITS. */
191
192
  dataptr = data;
193
194
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
195
    /* Due to quantization, we will usually find that many of the input
196
     * coefficients are zero, especially the AC terms.  We can exploit this
197
     * by short-circuiting the IDCT calculation for any row in which all
198
     * the AC terms are zero.  In that case each output is equal to the
199
     * DC coefficient (with scale factor as needed).
200
     * With typical images and quantization tables, half or more of the
201
     * row DCT calculations can be simplified this way.
202
     */
203
204
    register int *idataptr = (int*)dataptr;
205
206 13b54752 Fabrice Bellard
    /* WARNING: we do the same permutation as MMX idct to simplify the
207
       video core */
208 de6d9b64 Fabrice Bellard
    d0 = dataptr[0];
209 13b54752 Fabrice Bellard
    d2 = dataptr[1];
210
    d4 = dataptr[2];
211
    d6 = dataptr[3];
212
    d1 = dataptr[4];
213
    d3 = dataptr[5];
214
    d5 = dataptr[6];
215 de6d9b64 Fabrice Bellard
    d7 = dataptr[7];
216
217 13b54752 Fabrice Bellard
    if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
218 de6d9b64 Fabrice Bellard
      /* AC terms all zero */
219
      if (d0) {
220 bb270c08 Diego Biurrun
          /* Compute a 32 bit value to assign. */
221
          DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
222
          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
223
224
          idataptr[0] = v;
225
          idataptr[1] = v;
226
          idataptr[2] = v;
227
          idataptr[3] = v;
228 de6d9b64 Fabrice Bellard
      }
229 115329f1 Diego Biurrun
230 bb270c08 Diego Biurrun
      dataptr += DCTSIZE;       /* advance pointer to next row */
231 de6d9b64 Fabrice Bellard
      continue;
232
    }
233
234
    /* Even part: reverse the even part of the forward DCT. */
235
    /* The rotator is sqrt(2)*c(-6). */
236
{
237
    if (d6) {
238 bb270c08 Diego Biurrun
            if (d2) {
239
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
240
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
241
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
242
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
243
244
                    tmp0 = (d0 + d4) << CONST_BITS;
245
                    tmp1 = (d0 - d4) << CONST_BITS;
246
247
                    tmp10 = tmp0 + tmp3;
248
                    tmp13 = tmp0 - tmp3;
249
                    tmp11 = tmp1 + tmp2;
250
                    tmp12 = tmp1 - tmp2;
251
            } else {
252
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
253
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
254
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
255
256
                    tmp0 = (d0 + d4) << CONST_BITS;
257
                    tmp1 = (d0 - d4) << CONST_BITS;
258
259
                    tmp10 = tmp0 + tmp3;
260
                    tmp13 = tmp0 - tmp3;
261
                    tmp11 = tmp1 + tmp2;
262
                    tmp12 = tmp1 - tmp2;
263
            }
264 de6d9b64 Fabrice Bellard
    } else {
265 bb270c08 Diego Biurrun
            if (d2) {
266
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
267
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
268
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
269
270
                    tmp0 = (d0 + d4) << CONST_BITS;
271
                    tmp1 = (d0 - d4) << CONST_BITS;
272
273
                    tmp10 = tmp0 + tmp3;
274
                    tmp13 = tmp0 - tmp3;
275
                    tmp11 = tmp1 + tmp2;
276
                    tmp12 = tmp1 - tmp2;
277
            } else {
278
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
279
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
280
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
281
            }
282 de6d9b64 Fabrice Bellard
      }
283
284
    /* Odd part per figure 8; the matrix is unitary and hence its
285
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
286
     */
287
288
    if (d7) {
289 bb270c08 Diego Biurrun
        if (d5) {
290
            if (d3) {
291
                if (d1) {
292
                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
293
                    z1 = d7 + d1;
294
                    z2 = d5 + d3;
295
                    z3 = d7 + d3;
296
                    z4 = d5 + d1;
297
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
298
299
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
300
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
301
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
302
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
303
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
304
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
305
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
306
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
307
308
                    z3 += z5;
309
                    z4 += z5;
310
311
                    tmp0 += z1 + z3;
312
                    tmp1 += z2 + z4;
313
                    tmp2 += z2 + z3;
314
                    tmp3 += z1 + z4;
315
                } else {
316
                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
317
                    z2 = d5 + d3;
318
                    z3 = d7 + d3;
319
                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
320
321
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
322
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
323
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
324
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
325
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
326
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
327
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
328
329
                    z3 += z5;
330
                    z4 += z5;
331
332
                    tmp0 += z1 + z3;
333
                    tmp1 += z2 + z4;
334
                    tmp2 += z2 + z3;
335
                    tmp3 = z1 + z4;
336
                }
337
            } else {
338
                if (d1) {
339
                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
340
                    z1 = d7 + d1;
341
                    z4 = d5 + d1;
342
                    z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
343
344
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
345
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
346
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
347
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
348
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
349
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
350
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
351
352
                    z3 += z5;
353
                    z4 += z5;
354
355
                    tmp0 += z1 + z3;
356
                    tmp1 += z2 + z4;
357
                    tmp2 = z2 + z3;
358
                    tmp3 += z1 + z4;
359
                } else {
360
                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
361
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
362
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
363
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
364
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
365
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
366
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
367
                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
368
369
                    z3 += z5;
370
                    z4 += z5;
371
372
                    tmp0 += z3;
373
                    tmp1 += z4;
374
                    tmp2 = z2 + z3;
375
                    tmp3 = z1 + z4;
376
                }
377
            }
378
        } else {
379
            if (d3) {
380
                if (d1) {
381
                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
382
                    z1 = d7 + d1;
383
                    z3 = d7 + d3;
384
                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
385
386
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
387
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
388
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
389
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
390
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
391
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
392
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
393
394
                    z3 += z5;
395
                    z4 += z5;
396
397
                    tmp0 += z1 + z3;
398
                    tmp1 = z2 + z4;
399
                    tmp2 += z2 + z3;
400
                    tmp3 += z1 + z4;
401
                } else {
402
                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
403
                    z3 = d7 + d3;
404
405
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
406
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
407
                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
408
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
409
                    z5 = MULTIPLY(z3, FIX_1_175875602);
410
                    z3 = MULTIPLY(-z3, FIX_0_785694958);
411
412
                    tmp0 += z3;
413
                    tmp1 = z2 + z5;
414
                    tmp2 += z3;
415
                    tmp3 = z1 + z5;
416
                }
417
            } else {
418
                if (d1) {
419
                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
420
                    z1 = d7 + d1;
421
                    z5 = MULTIPLY(z1, FIX_1_175875602);
422
423
                    z1 = MULTIPLY(z1, FIX_0_275899380);
424
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
425
                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
426
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
427
                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
428
429
                    tmp0 += z1;
430
                    tmp1 = z4 + z5;
431
                    tmp2 = z3 + z5;
432
                    tmp3 += z1;
433
                } else {
434
                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
435
                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
436
                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
437
                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
438
                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
439
                }
440
            }
441
        }
442 de6d9b64 Fabrice Bellard
    } else {
443 bb270c08 Diego Biurrun
        if (d5) {
444
            if (d3) {
445
                if (d1) {
446
                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
447
                    z2 = d5 + d3;
448
                    z4 = d5 + d1;
449
                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
450
451
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
452
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
453
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
454
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
455
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
456
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
457
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
458
459
                    z3 += z5;
460
                    z4 += z5;
461
462
                    tmp0 = z1 + z3;
463
                    tmp1 += z2 + z4;
464
                    tmp2 += z2 + z3;
465
                    tmp3 += z1 + z4;
466
                } else {
467
                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
468
                    z2 = d5 + d3;
469
470
                    z5 = MULTIPLY(z2, FIX_1_175875602);
471
                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
472
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
473
                    z2 = MULTIPLY(-z2, FIX_1_387039845);
474
                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
475
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
476
477
                    tmp0 = z3 + z5;
478
                    tmp1 += z2;
479
                    tmp2 += z2;
480
                    tmp3 = z4 + z5;
481
                }
482
            } else {
483
                if (d1) {
484
                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
485
                    z4 = d5 + d1;
486
487
                    z5 = MULTIPLY(z4, FIX_1_175875602);
488
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
489
                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
490
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
491
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
492
                    z4 = MULTIPLY(z4, FIX_0_785694958);
493
494
                    tmp0 = z1 + z5;
495
                    tmp1 += z4;
496
                    tmp2 = z2 + z5;
497
                    tmp3 += z4;
498
                } else {
499
                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
500
                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
501
                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
502
                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
503
                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
504
                }
505
            }
506
        } else {
507
            if (d3) {
508
                if (d1) {
509
                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
510
                    z5 = d1 + d3;
511
                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
512
                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
513
                    z1 = MULTIPLY(d1, FIX_1_061594337);
514
                    z2 = MULTIPLY(-d3, FIX_2_172734803);
515
                    z4 = MULTIPLY(z5, FIX_0_785694958);
516
                    z5 = MULTIPLY(z5, FIX_1_175875602);
517
518
                    tmp0 = z1 - z4;
519
                    tmp1 = z2 + z4;
520
                    tmp2 += z5;
521
                    tmp3 += z5;
522
                } else {
523
                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
524
                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
525
                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
526
                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
527
                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
528
                }
529
            } else {
530
                if (d1) {
531
                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
532
                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
533
                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
534
                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
535
                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
536
                } else {
537
                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
538
                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
539
                }
540
            }
541
        }
542 de6d9b64 Fabrice Bellard
    }
543
}
544
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
545
546
    dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
547
    dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
548
    dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
549
    dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
550
    dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
551
    dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
552
    dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
553
    dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
554
555 bb270c08 Diego Biurrun
    dataptr += DCTSIZE;         /* advance pointer to next row */
556 de6d9b64 Fabrice Bellard
  }
557
558
  /* Pass 2: process columns. */
559
  /* Note that we must descale the results by a factor of 8 == 2**3, */
560
  /* and also undo the PASS1_BITS scaling. */
561
562
  dataptr = data;
563
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
564
    /* Columns of zeroes can be exploited in the same way as we did with rows.
565
     * However, the row calculation has created many nonzero AC terms, so the
566
     * simplification applies less often (typically 5% to 10% of the time).
567
     * On machines with very fast multiplication, it's possible that the
568
     * test takes more time than it's worth.  In that case this section
569
     * may be commented out.
570
     */
571
572
    d0 = dataptr[DCTSIZE*0];
573
    d1 = dataptr[DCTSIZE*1];
574
    d2 = dataptr[DCTSIZE*2];
575
    d3 = dataptr[DCTSIZE*3];
576
    d4 = dataptr[DCTSIZE*4];
577
    d5 = dataptr[DCTSIZE*5];
578
    d6 = dataptr[DCTSIZE*6];
579
    d7 = dataptr[DCTSIZE*7];
580
581
    /* Even part: reverse the even part of the forward DCT. */
582
    /* The rotator is sqrt(2)*c(-6). */
583
    if (d6) {
584 bb270c08 Diego Biurrun
            if (d2) {
585
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
586
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
587
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
588
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
589
590
                    tmp0 = (d0 + d4) << CONST_BITS;
591
                    tmp1 = (d0 - d4) << CONST_BITS;
592
593
                    tmp10 = tmp0 + tmp3;
594
                    tmp13 = tmp0 - tmp3;
595
                    tmp11 = tmp1 + tmp2;
596
                    tmp12 = tmp1 - tmp2;
597
            } else {
598
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
599
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
600
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
601
602
                    tmp0 = (d0 + d4) << CONST_BITS;
603
                    tmp1 = (d0 - d4) << CONST_BITS;
604
605
                    tmp10 = tmp0 + tmp3;
606
                    tmp13 = tmp0 - tmp3;
607
                    tmp11 = tmp1 + tmp2;
608
                    tmp12 = tmp1 - tmp2;
609
            }
610 de6d9b64 Fabrice Bellard
    } else {
611 bb270c08 Diego Biurrun
            if (d2) {
612
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
613
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
614
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
615
616
                    tmp0 = (d0 + d4) << CONST_BITS;
617
                    tmp1 = (d0 - d4) << CONST_BITS;
618
619
                    tmp10 = tmp0 + tmp3;
620
                    tmp13 = tmp0 - tmp3;
621
                    tmp11 = tmp1 + tmp2;
622
                    tmp12 = tmp1 - tmp2;
623
            } else {
624
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
625
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
626
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
627
            }
628 de6d9b64 Fabrice Bellard
    }
629
630
    /* Odd part per figure 8; the matrix is unitary and hence its
631
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
632
     */
633
    if (d7) {
634 bb270c08 Diego Biurrun
        if (d5) {
635
            if (d3) {
636
                if (d1) {
637
                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
638
                    z1 = d7 + d1;
639
                    z2 = d5 + d3;
640
                    z3 = d7 + d3;
641
                    z4 = d5 + d1;
642
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
643
644
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
645
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
646
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
647
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
648
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
649
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
650
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
651
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
652
653
                    z3 += z5;
654
                    z4 += z5;
655
656
                    tmp0 += z1 + z3;
657
                    tmp1 += z2 + z4;
658
                    tmp2 += z2 + z3;
659
                    tmp3 += z1 + z4;
660
                } else {
661
                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
662
                    z1 = d7;
663
                    z2 = d5 + d3;
664
                    z3 = d7 + d3;
665
                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
666
667
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
668
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
669
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
670
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
671
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
672
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
673
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
674
675
                    z3 += z5;
676
                    z4 += z5;
677
678
                    tmp0 += z1 + z3;
679
                    tmp1 += z2 + z4;
680
                    tmp2 += z2 + z3;
681
                    tmp3 = z1 + z4;
682
                }
683
            } else {
684
                if (d1) {
685
                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
686
                    z1 = d7 + d1;
687
                    z2 = d5;
688
                    z3 = d7;
689
                    z4 = d5 + d1;
690
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
691
692
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
693
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
694
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
695
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
696
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
697
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
698
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
699
700
                    z3 += z5;
701
                    z4 += z5;
702
703
                    tmp0 += z1 + z3;
704
                    tmp1 += z2 + z4;
705
                    tmp2 = z2 + z3;
706
                    tmp3 += z1 + z4;
707
                } else {
708
                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
709
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
710
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
711
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
712
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
713
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
714
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
715
                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
716
717
                    z3 += z5;
718
                    z4 += z5;
719
720
                    tmp0 += z3;
721
                    tmp1 += z4;
722
                    tmp2 = z2 + z3;
723
                    tmp3 = z1 + z4;
724
                }
725
            }
726
        } else {
727
            if (d3) {
728
                if (d1) {
729
                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
730
                    z1 = d7 + d1;
731
                    z3 = d7 + d3;
732
                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
733
734
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
735
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
736
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
737
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
738
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
739
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
740
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
741
742
                    z3 += z5;
743
                    z4 += z5;
744
745
                    tmp0 += z1 + z3;
746
                    tmp1 = z2 + z4;
747
                    tmp2 += z2 + z3;
748
                    tmp3 += z1 + z4;
749
                } else {
750
                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
751
                    z3 = d7 + d3;
752
753
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
754
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
755
                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
756
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
757
                    z5 = MULTIPLY(z3, FIX_1_175875602);
758
                    z3 = MULTIPLY(-z3, FIX_0_785694958);
759
760
                    tmp0 += z3;
761
                    tmp1 = z2 + z5;
762
                    tmp2 += z3;
763
                    tmp3 = z1 + z5;
764
                }
765
            } else {
766
                if (d1) {
767
                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
768
                    z1 = d7 + d1;
769
                    z5 = MULTIPLY(z1, FIX_1_175875602);
770
771
                    z1 = MULTIPLY(z1, FIX_0_275899380);
772
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
773
                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
774
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
775
                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
776
777
                    tmp0 += z1;
778
                    tmp1 = z4 + z5;
779
                    tmp2 = z3 + z5;
780
                    tmp3 += z1;
781
                } else {
782
                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
783
                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
784
                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
785
                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
786
                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
787
                }
788
            }
789
        }
790 de6d9b64 Fabrice Bellard
    } else {
791 bb270c08 Diego Biurrun
        if (d5) {
792
            if (d3) {
793
                if (d1) {
794
                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
795
                    z2 = d5 + d3;
796
                    z4 = d5 + d1;
797
                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
798
799
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
800
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
801
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
802
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
803
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
804
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
805
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
806
807
                    z3 += z5;
808
                    z4 += z5;
809
810
                    tmp0 = z1 + z3;
811
                    tmp1 += z2 + z4;
812
                    tmp2 += z2 + z3;
813
                    tmp3 += z1 + z4;
814
                } else {
815
                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
816
                    z2 = d5 + d3;
817
818
                    z5 = MULTIPLY(z2, FIX_1_175875602);
819
                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
820
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
821
                    z2 = MULTIPLY(-z2, FIX_1_387039845);
822
                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
823
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
824
825
                    tmp0 = z3 + z5;
826
                    tmp1 += z2;
827
                    tmp2 += z2;
828
                    tmp3 = z4 + z5;
829
                }
830
            } else {
831
                if (d1) {
832
                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
833
                    z4 = d5 + d1;
834
835
                    z5 = MULTIPLY(z4, FIX_1_175875602);
836
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
837
                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
838
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
839
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
840
                    z4 = MULTIPLY(z4, FIX_0_785694958);
841
842
                    tmp0 = z1 + z5;
843
                    tmp1 += z4;
844
                    tmp2 = z2 + z5;
845
                    tmp3 += z4;
846
                } else {
847
                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
848
                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
849
                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
850
                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
851
                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
852
                }
853
            }
854
        } else {
855
            if (d3) {
856
                if (d1) {
857
                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
858
                    z5 = d1 + d3;
859
                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
860
                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
861
                    z1 = MULTIPLY(d1, FIX_1_061594337);
862
                    z2 = MULTIPLY(-d3, FIX_2_172734803);
863
                    z4 = MULTIPLY(z5, FIX_0_785694958);
864
                    z5 = MULTIPLY(z5, FIX_1_175875602);
865
866
                    tmp0 = z1 - z4;
867
                    tmp1 = z2 + z4;
868
                    tmp2 += z5;
869
                    tmp3 += z5;
870
                } else {
871
                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
872
                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
873
                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
874
                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
875
                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
876
                }
877
            } else {
878
                if (d1) {
879
                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
880
                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
881
                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
882
                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
883
                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
884
                } else {
885
                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
886
                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
887
                }
888
            }
889
        }
890 de6d9b64 Fabrice Bellard
    }
891
892
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
893
894
    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
895 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
896 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
897 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
898 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
899 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
900 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
901 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
902 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
903 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
904 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
905 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
906 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
907 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
908 de6d9b64 Fabrice Bellard
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
909 bb270c08 Diego Biurrun
                                           CONST_BITS+PASS1_BITS+3);
910 115329f1 Diego Biurrun
911 bb270c08 Diego Biurrun
    dataptr++;                  /* advance pointer to next column */
912 de6d9b64 Fabrice Bellard
  }
913
}
914
915 178fcca8 Michael Niedermayer
/*
 * Geometry for the reduced-size inverse transforms that follow:
 * DCTSIZE is redefined to 4 (rows/columns processed per pass), while
 * DCTSTRIDE (8) is the distance, in elements, between consecutive rows
 * of the coefficient block.
 */
#undef DCTSIZE
#define DCTSIZE 4
#define DCTSTRIDE 8
918
919
void j_rev_dct4(DCTBLOCK data)
920
{
921
  int32_t tmp0, tmp1, tmp2, tmp3;
922
  int32_t tmp10, tmp11, tmp12, tmp13;
923
  int32_t z1;
924
  int32_t d0, d2, d4, d6;
925
  register DCTELEM *dataptr;
926
  int rowctr;
927 affbf043 Michael Niedermayer
928 178fcca8 Michael Niedermayer
  /* Pass 1: process rows. */
929
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
930
  /* furthermore, we scale the results by 2**PASS1_BITS. */
931
932 affbf043 Michael Niedermayer
  data[0] += 4;
933 115329f1 Diego Biurrun
934 178fcca8 Michael Niedermayer
  dataptr = data;
935
936
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
937
    /* Due to quantization, we will usually find that many of the input
938
     * coefficients are zero, especially the AC terms.  We can exploit this
939
     * by short-circuiting the IDCT calculation for any row in which all
940
     * the AC terms are zero.  In that case each output is equal to the
941
     * DC coefficient (with scale factor as needed).
942
     * With typical images and quantization tables, half or more of the
943
     * row DCT calculations can be simplified this way.
944
     */
945
946
    register int *idataptr = (int*)dataptr;
947
948
    d0 = dataptr[0];
949
    d2 = dataptr[1];
950
    d4 = dataptr[2];
951
    d6 = dataptr[3];
952
953
    if ((d2 | d4 | d6) == 0) {
954
      /* AC terms all zero */
955
      if (d0) {
956 bb270c08 Diego Biurrun
          /* Compute a 32 bit value to assign. */
957
          DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
958
          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
959 115329f1 Diego Biurrun
960 bb270c08 Diego Biurrun
          idataptr[0] = v;
961
          idataptr[1] = v;
962 178fcca8 Michael Niedermayer
      }
963 115329f1 Diego Biurrun
964 bb270c08 Diego Biurrun
      dataptr += DCTSTRIDE;     /* advance pointer to next row */
965 178fcca8 Michael Niedermayer
      continue;
966
    }
967 115329f1 Diego Biurrun
968 178fcca8 Michael Niedermayer
    /* Even part: reverse the even part of the forward DCT. */
969
    /* The rotator is sqrt(2)*c(-6). */
970
    if (d6) {
971 bb270c08 Diego Biurrun
            if (d2) {
972
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
973
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
974
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
975
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
976
977
                    tmp0 = (d0 + d4) << CONST_BITS;
978
                    tmp1 = (d0 - d4) << CONST_BITS;
979
980
                    tmp10 = tmp0 + tmp3;
981
                    tmp13 = tmp0 - tmp3;
982
                    tmp11 = tmp1 + tmp2;
983
                    tmp12 = tmp1 - tmp2;
984
            } else {
985
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
986
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
987
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
988
989
                    tmp0 = (d0 + d4) << CONST_BITS;
990
                    tmp1 = (d0 - d4) << CONST_BITS;
991
992
                    tmp10 = tmp0 + tmp3;
993
                    tmp13 = tmp0 - tmp3;
994
                    tmp11 = tmp1 + tmp2;
995
                    tmp12 = tmp1 - tmp2;
996
            }
997 178fcca8 Michael Niedermayer
    } else {
998 bb270c08 Diego Biurrun
            if (d2) {
999
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1000
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
1001
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
1002
1003
                    tmp0 = (d0 + d4) << CONST_BITS;
1004
                    tmp1 = (d0 - d4) << CONST_BITS;
1005
1006
                    tmp10 = tmp0 + tmp3;
1007
                    tmp13 = tmp0 - tmp3;
1008
                    tmp11 = tmp1 + tmp2;
1009
                    tmp12 = tmp1 - tmp2;
1010
            } else {
1011
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1012
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1013
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1014
            }
1015 178fcca8 Michael Niedermayer
      }
1016
1017
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1018
1019
    dataptr[0] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1020
    dataptr[1] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1021
    dataptr[2] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1022
    dataptr[3] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1023
1024 bb270c08 Diego Biurrun
    dataptr += DCTSTRIDE;       /* advance pointer to next row */
1025 178fcca8 Michael Niedermayer
  }
1026
1027
  /* Pass 2: process columns. */
1028
  /* Note that we must descale the results by a factor of 8 == 2**3, */
1029
  /* and also undo the PASS1_BITS scaling. */
1030
1031
  dataptr = data;
1032
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1033
    /* Columns of zeroes can be exploited in the same way as we did with rows.
1034
     * However, the row calculation has created many nonzero AC terms, so the
1035
     * simplification applies less often (typically 5% to 10% of the time).
1036
     * On machines with very fast multiplication, it's possible that the
1037
     * test takes more time than it's worth.  In that case this section
1038
     * may be commented out.
1039
     */
1040
1041
    d0 = dataptr[DCTSTRIDE*0];
1042
    d2 = dataptr[DCTSTRIDE*1];
1043
    d4 = dataptr[DCTSTRIDE*2];
1044
    d6 = dataptr[DCTSTRIDE*3];
1045
1046
    /* Even part: reverse the even part of the forward DCT. */
1047
    /* The rotator is sqrt(2)*c(-6). */
1048
    if (d6) {
1049 bb270c08 Diego Biurrun
            if (d2) {
1050
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1051
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1052
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1053
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1054
1055
                    tmp0 = (d0 + d4) << CONST_BITS;
1056
                    tmp1 = (d0 - d4) << CONST_BITS;
1057
1058
                    tmp10 = tmp0 + tmp3;
1059
                    tmp13 = tmp0 - tmp3;
1060
                    tmp11 = tmp1 + tmp2;
1061
                    tmp12 = tmp1 - tmp2;
1062
            } else {
1063
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1064
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1065
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
1066
1067
                    tmp0 = (d0 + d4) << CONST_BITS;
1068
                    tmp1 = (d0 - d4) << CONST_BITS;
1069
1070
                    tmp10 = tmp0 + tmp3;
1071
                    tmp13 = tmp0 - tmp3;
1072
                    tmp11 = tmp1 + tmp2;
1073
                    tmp12 = tmp1 - tmp2;
1074
            }
1075 178fcca8 Michael Niedermayer
    } else {
1076 bb270c08 Diego Biurrun
            if (d2) {
1077
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1078
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
1079
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
1080
1081
                    tmp0 = (d0 + d4) << CONST_BITS;
1082
                    tmp1 = (d0 - d4) << CONST_BITS;
1083
1084
                    tmp10 = tmp0 + tmp3;
1085
                    tmp13 = tmp0 - tmp3;
1086
                    tmp11 = tmp1 + tmp2;
1087
                    tmp12 = tmp1 - tmp2;
1088
            } else {
1089
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1090
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1091
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1092
            }
1093 178fcca8 Michael Niedermayer
    }
1094
1095
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1096
1097 affbf043 Michael Niedermayer
    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
1098
    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
1099
    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
1100
    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
1101 115329f1 Diego Biurrun
1102 bb270c08 Diego Biurrun
    dataptr++;                  /* advance pointer to next column */
1103 178fcca8 Michael Niedermayer
  }
1104
}
1105
1106 9ca358b9 Michael Niedermayer
/**
 * In-place 2x2 inverse transform on the top-left corner of a block.
 *
 * Reads and rewrites the elements at offsets 0, 1, DCTSTRIDE and
 * DCTSTRIDE + 1; nothing else in the block is touched.
 *
 * @param data coefficient block, overwritten with the transformed values
 */
void j_rev_dct2(DCTBLOCK data)
{
    int top_left, top_right, bottom_left, bottom_right;
    int top_sum, top_diff, bottom_sum, bottom_diff;

    /* Rounding bias for the >> 3 below; data[0] feeds all four outputs. */
    data[0] += 4;

    /* Fetch every input before any output is stored. */
    top_left     = data[0];
    top_right    = data[1];
    bottom_left  = data[DCTSTRIDE];
    bottom_right = data[DCTSTRIDE + 1];

    /* Horizontal butterflies, one per row. */
    top_sum     = top_left    + top_right;
    top_diff    = top_left    - top_right;
    bottom_sum  = bottom_left + bottom_right;
    bottom_diff = bottom_left - bottom_right;

    /* Vertical butterflies plus the final descale by 8. */
    data[0]             = (top_sum  + bottom_sum)  >> 3;
    data[1]             = (top_diff + bottom_diff) >> 3;
    data[DCTSTRIDE]     = (top_sum  - bottom_sum)  >> 3;
    data[DCTSTRIDE + 1] = (top_diff - bottom_diff) >> 3;
}
1120 178fcca8 Michael Niedermayer
1121 1aa8c57b Michael Niedermayer
/**
 * DC-only inverse transform: rescales data[0] in place.
 *
 * The coefficient is divided by 8 with rounding (add 4, then shift right
 * by 3); no other element of the block is accessed.
 *
 * @param data coefficient block; only data[0] is read and written
 */
void j_rev_dct1(DCTBLOCK data)
{
    const int dc = data[0];

    data[0] = (dc + 4) >> 3;
}
1124
1125 cd4af68a Zdenek Kabelac
/* Remove the FIX and CONST_BITS macro definitions from this point on. */
#undef FIX
#undef CONST_BITS