/* Source: ffmpeg / libavcodec / jrevdct.c @ 8a01fc47 (repository web view export, 33.2 KB) */
/*
 * jrevdct.c
 *
 * Copyright (C) 1991, 1992, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains the basic inverse-DCT transformation subroutine.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 *
 * I've made lots of modifications to attempt to take advantage of the
 * sparse nature of the DCT matrices we're getting.  Although the logic
 * is cumbersome, it's straightforward and the resulting code is much
 * faster.
 *
 * A better way to do this would be to pass in the DCT block as a sparse
 * matrix, perhaps with the difference cases encoded.
 */


/**
 * @file jrevdct.c
 * Independent JPEG Group's LLM idct.
 */


#include "common.h"
#include "dsputil.h"

#define EIGHT_BIT_SAMPLES

#define DCTSIZE 8
#define DCTSIZE2 64

#define GLOBAL

#define RIGHT_SHIFT(x, n) ((x) >> (n))

typedef DCTELEM DCTBLOCK[DCTSIZE2];

#define CONST_BITS 13

/*
 * This routine is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif


/*
 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
 * larger than the true IDCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D IDCT,
 * because the y0 and y4 inputs need not be divided by sqrt(N).
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
 * intermediate int32 array would be needed.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */

#ifdef EIGHT_BIT_SAMPLES
#define PASS1_BITS  2
#else
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
#endif

#define ONE         ((int32_t) 1)

#define CONST_SCALE (ONE << CONST_BITS)

/* Convert a positive real constant to an integer scaled by CONST_SCALE.
 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
 * you will pay a significant penalty in run time.  In that case, figure
 * the correct integer constant values and insert them by hand.
 */

/* Actually FIX is no longer used, we precomputed them all */
#define FIX(x)  ((int32_t) ((x) * CONST_SCALE + 0.5))

/* Descale and correctly round an int32_t value that's scaled by N bits.
 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
 * the fudge factor is correct for either sign of X.
 * The fudge factor is half of the divisor, i.e. 1 << (n-1).
 */

#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)

/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
 * this provides a useful speedup on many machines.
 * There is no way to specify a 16x16->32 multiply in portable C, but
 * some C compilers will do the right thing if you provide the correct
 * combination of casts.
 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
 */

#ifdef EIGHT_BIT_SAMPLES
#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int16_t) (const)))
#endif
#ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int32_t) (const)))
#endif
#endif

#ifndef MULTIPLY                /* default definition */
#define MULTIPLY(var,const)  ((var) * (const))
#endif


/*
  Unlike our decoder where we approximate the FIXes, we need to use exact
  ones here or successive P-frames will drift too much with Reference frame coding
*/
#define FIX_0_211164243 1730
#define FIX_0_275899380 2260
#define FIX_0_298631336 2446
#define FIX_0_390180644 3196
#define FIX_0_509795579 4176
#define FIX_0_541196100 4433
#define FIX_0_601344887 4926
#define FIX_0_765366865 6270
#define FIX_0_785694958 6436
#define FIX_0_899976223 7373
#define FIX_1_061594337 8697
#define FIX_1_111140466 9102
#define FIX_1_175875602 9633
#define FIX_1_306562965 10703
#define FIX_1_387039845 11363
#define FIX_1_451774981 11893
#define FIX_1_501321110 12299
#define FIX_1_662939225 13623
#define FIX_1_847759065 15137
#define FIX_1_961570560 16069
#define FIX_2_053119869 16819
#define FIX_2_172734803 17799
#define FIX_2_562915447 20995
#define FIX_3_072711026 25172
174  
175 
/*

176 
* Perform the inverse DCT on one block of coefficients.

177 
*/

178  
179 
void j_rev_dct(DCTBLOCK data)

180 
{ 
181 
int32_t tmp0, tmp1, tmp2, tmp3; 
182 
int32_t tmp10, tmp11, tmp12, tmp13; 
183 
int32_t z1, z2, z3, z4, z5; 
184 
int32_t d0, d1, d2, d3, d4, d5, d6, d7; 
185 
register DCTELEM *dataptr;

186 
int rowctr;

187 

188 
/* Pass 1: process rows. */

189 
/* Note results are scaled up by sqrt(8) compared to a true IDCT; */

190 
/* furthermore, we scale the results by 2**PASS1_BITS. */

191  
192 
dataptr = data; 
193  
194 
for (rowctr = DCTSIZE1; rowctr >= 0; rowctr) { 
195 
/* Due to quantization, we will usually find that many of the input

196 
* coefficients are zero, especially the AC terms. We can exploit this

197 
* by shortcircuiting the IDCT calculation for any row in which all

198 
* the AC terms are zero. In that case each output is equal to the

199 
* DC coefficient (with scale factor as needed).

200 
* With typical images and quantization tables, half or more of the

201 
* row DCT calculations can be simplified this way.

202 
*/

203  
204 
register int *idataptr = (int*)dataptr; 
205  
206 
/* WARNING: we do the same permutation as MMX idct to simplify the

207 
video core */

208 
d0 = dataptr[0];

209 
d2 = dataptr[1];

210 
d4 = dataptr[2];

211 
d6 = dataptr[3];

212 
d1 = dataptr[4];

213 
d3 = dataptr[5];

214 
d5 = dataptr[6];

215 
d7 = dataptr[7];

216  
217 
if ((d1  d2  d3  d4  d5  d6  d7) == 0) { 
218 
/* AC terms all zero */

219 
if (d0) {

220 
/* Compute a 32 bit value to assign. */

221 
DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS); 
222 
register int v = (dcval & 0xffff)  ((dcval << 16) & 0xffff0000); 
223 

224 
idataptr[0] = v;

225 
idataptr[1] = v;

226 
idataptr[2] = v;

227 
idataptr[3] = v;

228 
} 
229 

230 
dataptr += DCTSIZE; /* advance pointer to next row */

231 
continue;

232 
} 
233  
234 
/* Even part: reverse the even part of the forward DCT. */

235 
/* The rotator is sqrt(2)*c(6). */

236 
{ 
237 
if (d6) {

238 
if (d2) {

239 
/* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */

240 
z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 
241 
tmp2 = z1 + MULTIPLY(d6, FIX_1_847759065); 
242 
tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 
243  
244 
tmp0 = (d0 + d4) << CONST_BITS; 
245 
tmp1 = (d0  d4) << CONST_BITS; 
246  
247 
tmp10 = tmp0 + tmp3; 
248 
tmp13 = tmp0  tmp3; 
249 
tmp11 = tmp1 + tmp2; 
250 
tmp12 = tmp1  tmp2; 
251 
} else {

252 
/* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */

253 
tmp2 = MULTIPLY(d6, FIX_1_306562965); 
254 
tmp3 = MULTIPLY(d6, FIX_0_541196100); 
255  
256 
tmp0 = (d0 + d4) << CONST_BITS; 
257 
tmp1 = (d0  d4) << CONST_BITS; 
258  
259 
tmp10 = tmp0 + tmp3; 
260 
tmp13 = tmp0  tmp3; 
261 
tmp11 = tmp1 + tmp2; 
262 
tmp12 = tmp1  tmp2; 
263 
} 
264 
} else {

265 
if (d2) {

266 
/* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */

267 
tmp2 = MULTIPLY(d2, FIX_0_541196100); 
268 
tmp3 = MULTIPLY(d2, FIX_1_306562965); 
269  
270 
tmp0 = (d0 + d4) << CONST_BITS; 
271 
tmp1 = (d0  d4) << CONST_BITS; 
272  
273 
tmp10 = tmp0 + tmp3; 
274 
tmp13 = tmp0  tmp3; 
275 
tmp11 = tmp1 + tmp2; 
276 
tmp12 = tmp1  tmp2; 
277 
} else {

278 
/* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */

279 
tmp10 = tmp13 = (d0 + d4) << CONST_BITS; 
280 
tmp11 = tmp12 = (d0  d4) << CONST_BITS; 
281 
} 
282 
} 
283  
284 
/* Odd part per figure 8; the matrix is unitary and hence its

285 
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.

286 
*/

287  
288 
if (d7) {

289 
if (d5) {

290 
if (d3) {

291 
if (d1) {

292 
/* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */

293 
z1 = d7 + d1; 
294 
z2 = d5 + d3; 
295 
z3 = d7 + d3; 
296 
z4 = d5 + d1; 
297 
z5 = MULTIPLY(z3 + z4, FIX_1_175875602); 
298 

299 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
300 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
301 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
302 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
303 
z1 = MULTIPLY(z1, FIX_0_899976223); 
304 
z2 = MULTIPLY(z2, FIX_2_562915447); 
305 
z3 = MULTIPLY(z3, FIX_1_961570560); 
306 
z4 = MULTIPLY(z4, FIX_0_390180644); 
307 

308 
z3 += z5; 
309 
z4 += z5; 
310 

311 
tmp0 += z1 + z3; 
312 
tmp1 += z2 + z4; 
313 
tmp2 += z2 + z3; 
314 
tmp3 += z1 + z4; 
315 
} else {

316 
/* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */

317 
z2 = d5 + d3; 
318 
z3 = d7 + d3; 
319 
z5 = MULTIPLY(z3 + d5, FIX_1_175875602); 
320 

321 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
322 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
323 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
324 
z1 = MULTIPLY(d7, FIX_0_899976223); 
325 
z2 = MULTIPLY(z2, FIX_2_562915447); 
326 
z3 = MULTIPLY(z3, FIX_1_961570560); 
327 
z4 = MULTIPLY(d5, FIX_0_390180644); 
328 

329 
z3 += z5; 
330 
z4 += z5; 
331 

332 
tmp0 += z1 + z3; 
333 
tmp1 += z2 + z4; 
334 
tmp2 += z2 + z3; 
335 
tmp3 = z1 + z4; 
336 
} 
337 
} else {

338 
if (d1) {

339 
/* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */

340 
z1 = d7 + d1; 
341 
z4 = d5 + d1; 
342 
z5 = MULTIPLY(d7 + z4, FIX_1_175875602); 
343 

344 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
345 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
346 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
347 
z1 = MULTIPLY(z1, FIX_0_899976223); 
348 
z2 = MULTIPLY(d5, FIX_2_562915447); 
349 
z3 = MULTIPLY(d7, FIX_1_961570560); 
350 
z4 = MULTIPLY(z4, FIX_0_390180644); 
351 

352 
z3 += z5; 
353 
z4 += z5; 
354 

355 
tmp0 += z1 + z3; 
356 
tmp1 += z2 + z4; 
357 
tmp2 = z2 + z3; 
358 
tmp3 += z1 + z4; 
359 
} else {

360 
/* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */

361 
tmp0 = MULTIPLY(d7, FIX_0_601344887); 
362 
z1 = MULTIPLY(d7, FIX_0_899976223); 
363 
z3 = MULTIPLY(d7, FIX_1_961570560); 
364 
tmp1 = MULTIPLY(d5, FIX_0_509795579); 
365 
z2 = MULTIPLY(d5, FIX_2_562915447); 
366 
z4 = MULTIPLY(d5, FIX_0_390180644); 
367 
z5 = MULTIPLY(d5 + d7, FIX_1_175875602); 
368 

369 
z3 += z5; 
370 
z4 += z5; 
371 

372 
tmp0 += z3; 
373 
tmp1 += z4; 
374 
tmp2 = z2 + z3; 
375 
tmp3 = z1 + z4; 
376 
} 
377 
} 
378 
} else {

379 
if (d3) {

380 
if (d1) {

381 
/* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */

382 
z1 = d7 + d1; 
383 
z3 = d7 + d3; 
384 
z5 = MULTIPLY(z3 + d1, FIX_1_175875602); 
385 

386 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
387 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
388 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
389 
z1 = MULTIPLY(z1, FIX_0_899976223); 
390 
z2 = MULTIPLY(d3, FIX_2_562915447); 
391 
z3 = MULTIPLY(z3, FIX_1_961570560); 
392 
z4 = MULTIPLY(d1, FIX_0_390180644); 
393 

394 
z3 += z5; 
395 
z4 += z5; 
396 

397 
tmp0 += z1 + z3; 
398 
tmp1 = z2 + z4; 
399 
tmp2 += z2 + z3; 
400 
tmp3 += z1 + z4; 
401 
} else {

402 
/* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */

403 
z3 = d7 + d3; 
404 

405 
tmp0 = MULTIPLY(d7, FIX_0_601344887); 
406 
z1 = MULTIPLY(d7, FIX_0_899976223); 
407 
tmp2 = MULTIPLY(d3, FIX_0_509795579); 
408 
z2 = MULTIPLY(d3, FIX_2_562915447); 
409 
z5 = MULTIPLY(z3, FIX_1_175875602); 
410 
z3 = MULTIPLY(z3, FIX_0_785694958); 
411 

412 
tmp0 += z3; 
413 
tmp1 = z2 + z5; 
414 
tmp2 += z3; 
415 
tmp3 = z1 + z5; 
416 
} 
417 
} else {

418 
if (d1) {

419 
/* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */

420 
z1 = d7 + d1; 
421 
z5 = MULTIPLY(z1, FIX_1_175875602); 
422  
423 
z1 = MULTIPLY(z1, FIX_0_275899380); 
424 
z3 = MULTIPLY(d7, FIX_1_961570560); 
425 
tmp0 = MULTIPLY(d7, FIX_1_662939225); 
426 
z4 = MULTIPLY(d1, FIX_0_390180644); 
427 
tmp3 = MULTIPLY(d1, FIX_1_111140466); 
428  
429 
tmp0 += z1; 
430 
tmp1 = z4 + z5; 
431 
tmp2 = z3 + z5; 
432 
tmp3 += z1; 
433 
} else {

434 
/* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */

435 
tmp0 = MULTIPLY(d7, FIX_1_387039845); 
436 
tmp1 = MULTIPLY(d7, FIX_1_175875602); 
437 
tmp2 = MULTIPLY(d7, FIX_0_785694958); 
438 
tmp3 = MULTIPLY(d7, FIX_0_275899380); 
439 
} 
440 
} 
441 
} 
442 
} else {

443 
if (d5) {

444 
if (d3) {

445 
if (d1) {

446 
/* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */

447 
z2 = d5 + d3; 
448 
z4 = d5 + d1; 
449 
z5 = MULTIPLY(d3 + z4, FIX_1_175875602); 
450 

451 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
452 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
453 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
454 
z1 = MULTIPLY(d1, FIX_0_899976223); 
455 
z2 = MULTIPLY(z2, FIX_2_562915447); 
456 
z3 = MULTIPLY(d3, FIX_1_961570560); 
457 
z4 = MULTIPLY(z4, FIX_0_390180644); 
458 

459 
z3 += z5; 
460 
z4 += z5; 
461 

462 
tmp0 = z1 + z3; 
463 
tmp1 += z2 + z4; 
464 
tmp2 += z2 + z3; 
465 
tmp3 += z1 + z4; 
466 
} else {

467 
/* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */

468 
z2 = d5 + d3; 
469 

470 
z5 = MULTIPLY(z2, FIX_1_175875602); 
471 
tmp1 = MULTIPLY(d5, FIX_1_662939225); 
472 
z4 = MULTIPLY(d5, FIX_0_390180644); 
473 
z2 = MULTIPLY(z2, FIX_1_387039845); 
474 
tmp2 = MULTIPLY(d3, FIX_1_111140466); 
475 
z3 = MULTIPLY(d3, FIX_1_961570560); 
476 

477 
tmp0 = z3 + z5; 
478 
tmp1 += z2; 
479 
tmp2 += z2; 
480 
tmp3 = z4 + z5; 
481 
} 
482 
} else {

483 
if (d1) {

484 
/* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */

485 
z4 = d5 + d1; 
486 

487 
z5 = MULTIPLY(z4, FIX_1_175875602); 
488 
z1 = MULTIPLY(d1, FIX_0_899976223); 
489 
tmp3 = MULTIPLY(d1, FIX_0_601344887); 
490 
tmp1 = MULTIPLY(d5, FIX_0_509795579); 
491 
z2 = MULTIPLY(d5, FIX_2_562915447); 
492 
z4 = MULTIPLY(z4, FIX_0_785694958); 
493 

494 
tmp0 = z1 + z5; 
495 
tmp1 += z4; 
496 
tmp2 = z2 + z5; 
497 
tmp3 += z4; 
498 
} else {

499 
/* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */

500 
tmp0 = MULTIPLY(d5, FIX_1_175875602); 
501 
tmp1 = MULTIPLY(d5, FIX_0_275899380); 
502 
tmp2 = MULTIPLY(d5, FIX_1_387039845); 
503 
tmp3 = MULTIPLY(d5, FIX_0_785694958); 
504 
} 
505 
} 
506 
} else {

507 
if (d3) {

508 
if (d1) {

509 
/* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */

510 
z5 = d1 + d3; 
511 
tmp3 = MULTIPLY(d1, FIX_0_211164243); 
512 
tmp2 = MULTIPLY(d3, FIX_1_451774981); 
513 
z1 = MULTIPLY(d1, FIX_1_061594337); 
514 
z2 = MULTIPLY(d3, FIX_2_172734803); 
515 
z4 = MULTIPLY(z5, FIX_0_785694958); 
516 
z5 = MULTIPLY(z5, FIX_1_175875602); 
517 

518 
tmp0 = z1  z4; 
519 
tmp1 = z2 + z4; 
520 
tmp2 += z5; 
521 
tmp3 += z5; 
522 
} else {

523 
/* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */

524 
tmp0 = MULTIPLY(d3, FIX_0_785694958); 
525 
tmp1 = MULTIPLY(d3, FIX_1_387039845); 
526 
tmp2 = MULTIPLY(d3, FIX_0_275899380); 
527 
tmp3 = MULTIPLY(d3, FIX_1_175875602); 
528 
} 
529 
} else {

530 
if (d1) {

531 
/* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */

532 
tmp0 = MULTIPLY(d1, FIX_0_275899380); 
533 
tmp1 = MULTIPLY(d1, FIX_0_785694958); 
534 
tmp2 = MULTIPLY(d1, FIX_1_175875602); 
535 
tmp3 = MULTIPLY(d1, FIX_1_387039845); 
536 
} else {

537 
/* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */

538 
tmp0 = tmp1 = tmp2 = tmp3 = 0;

539 
} 
540 
} 
541 
} 
542 
} 
543 
} 
544 
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

545  
546 
dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITSPASS1_BITS);

547 
dataptr[7] = (DCTELEM) DESCALE(tmp10  tmp3, CONST_BITSPASS1_BITS);

548 
dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITSPASS1_BITS);

549 
dataptr[6] = (DCTELEM) DESCALE(tmp11  tmp2, CONST_BITSPASS1_BITS);

550 
dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITSPASS1_BITS);

551 
dataptr[5] = (DCTELEM) DESCALE(tmp12  tmp1, CONST_BITSPASS1_BITS);

552 
dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITSPASS1_BITS);

553 
dataptr[4] = (DCTELEM) DESCALE(tmp13  tmp0, CONST_BITSPASS1_BITS);

554  
555 
dataptr += DCTSIZE; /* advance pointer to next row */

556 
} 
557  
558 
/* Pass 2: process columns. */

559 
/* Note that we must descale the results by a factor of 8 == 2**3, */

560 
/* and also undo the PASS1_BITS scaling. */

561  
562 
dataptr = data; 
563 
for (rowctr = DCTSIZE1; rowctr >= 0; rowctr) { 
564 
/* Columns of zeroes can be exploited in the same way as we did with rows.

565 
* However, the row calculation has created many nonzero AC terms, so the

566 
* simplification applies less often (typically 5% to 10% of the time).

567 
* On machines with very fast multiplication, it's possible that the

568 
* test takes more time than it's worth. In that case this section

569 
* may be commented out.

570 
*/

571  
572 
d0 = dataptr[DCTSIZE*0];

573 
d1 = dataptr[DCTSIZE*1];

574 
d2 = dataptr[DCTSIZE*2];

575 
d3 = dataptr[DCTSIZE*3];

576 
d4 = dataptr[DCTSIZE*4];

577 
d5 = dataptr[DCTSIZE*5];

578 
d6 = dataptr[DCTSIZE*6];

579 
d7 = dataptr[DCTSIZE*7];

580  
581 
/* Even part: reverse the even part of the forward DCT. */

582 
/* The rotator is sqrt(2)*c(6). */

583 
if (d6) {

584 
if (d2) {

585 
/* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */

586 
z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 
587 
tmp2 = z1 + MULTIPLY(d6, FIX_1_847759065); 
588 
tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 
589  
590 
tmp0 = (d0 + d4) << CONST_BITS; 
591 
tmp1 = (d0  d4) << CONST_BITS; 
592  
593 
tmp10 = tmp0 + tmp3; 
594 
tmp13 = tmp0  tmp3; 
595 
tmp11 = tmp1 + tmp2; 
596 
tmp12 = tmp1  tmp2; 
597 
} else {

598 
/* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */

599 
tmp2 = MULTIPLY(d6, FIX_1_306562965); 
600 
tmp3 = MULTIPLY(d6, FIX_0_541196100); 
601  
602 
tmp0 = (d0 + d4) << CONST_BITS; 
603 
tmp1 = (d0  d4) << CONST_BITS; 
604  
605 
tmp10 = tmp0 + tmp3; 
606 
tmp13 = tmp0  tmp3; 
607 
tmp11 = tmp1 + tmp2; 
608 
tmp12 = tmp1  tmp2; 
609 
} 
610 
} else {

611 
if (d2) {

612 
/* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */

613 
tmp2 = MULTIPLY(d2, FIX_0_541196100); 
614 
tmp3 = MULTIPLY(d2, FIX_1_306562965); 
615  
616 
tmp0 = (d0 + d4) << CONST_BITS; 
617 
tmp1 = (d0  d4) << CONST_BITS; 
618  
619 
tmp10 = tmp0 + tmp3; 
620 
tmp13 = tmp0  tmp3; 
621 
tmp11 = tmp1 + tmp2; 
622 
tmp12 = tmp1  tmp2; 
623 
} else {

624 
/* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */

625 
tmp10 = tmp13 = (d0 + d4) << CONST_BITS; 
626 
tmp11 = tmp12 = (d0  d4) << CONST_BITS; 
627 
} 
628 
} 
629  
630 
/* Odd part per figure 8; the matrix is unitary and hence its

631 
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.

632 
*/

633 
if (d7) {

634 
if (d5) {

635 
if (d3) {

636 
if (d1) {

637 
/* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */

638 
z1 = d7 + d1; 
639 
z2 = d5 + d3; 
640 
z3 = d7 + d3; 
641 
z4 = d5 + d1; 
642 
z5 = MULTIPLY(z3 + z4, FIX_1_175875602); 
643 

644 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
645 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
646 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
647 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
648 
z1 = MULTIPLY(z1, FIX_0_899976223); 
649 
z2 = MULTIPLY(z2, FIX_2_562915447); 
650 
z3 = MULTIPLY(z3, FIX_1_961570560); 
651 
z4 = MULTIPLY(z4, FIX_0_390180644); 
652 

653 
z3 += z5; 
654 
z4 += z5; 
655 

656 
tmp0 += z1 + z3; 
657 
tmp1 += z2 + z4; 
658 
tmp2 += z2 + z3; 
659 
tmp3 += z1 + z4; 
660 
} else {

661 
/* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */

662 
z1 = d7; 
663 
z2 = d5 + d3; 
664 
z3 = d7 + d3; 
665 
z5 = MULTIPLY(z3 + d5, FIX_1_175875602); 
666 

667 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
668 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
669 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
670 
z1 = MULTIPLY(d7, FIX_0_899976223); 
671 
z2 = MULTIPLY(z2, FIX_2_562915447); 
672 
z3 = MULTIPLY(z3, FIX_1_961570560); 
673 
z4 = MULTIPLY(d5, FIX_0_390180644); 
674 

675 
z3 += z5; 
676 
z4 += z5; 
677 

678 
tmp0 += z1 + z3; 
679 
tmp1 += z2 + z4; 
680 
tmp2 += z2 + z3; 
681 
tmp3 = z1 + z4; 
682 
} 
683 
} else {

684 
if (d1) {

685 
/* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */

686 
z1 = d7 + d1; 
687 
z2 = d5; 
688 
z3 = d7; 
689 
z4 = d5 + d1; 
690 
z5 = MULTIPLY(z3 + z4, FIX_1_175875602); 
691 

692 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
693 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
694 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
695 
z1 = MULTIPLY(z1, FIX_0_899976223); 
696 
z2 = MULTIPLY(d5, FIX_2_562915447); 
697 
z3 = MULTIPLY(d7, FIX_1_961570560); 
698 
z4 = MULTIPLY(z4, FIX_0_390180644); 
699 

700 
z3 += z5; 
701 
z4 += z5; 
702 

703 
tmp0 += z1 + z3; 
704 
tmp1 += z2 + z4; 
705 
tmp2 = z2 + z3; 
706 
tmp3 += z1 + z4; 
707 
} else {

708 
/* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */

709 
tmp0 = MULTIPLY(d7, FIX_0_601344887); 
710 
z1 = MULTIPLY(d7, FIX_0_899976223); 
711 
z3 = MULTIPLY(d7, FIX_1_961570560); 
712 
tmp1 = MULTIPLY(d5, FIX_0_509795579); 
713 
z2 = MULTIPLY(d5, FIX_2_562915447); 
714 
z4 = MULTIPLY(d5, FIX_0_390180644); 
715 
z5 = MULTIPLY(d5 + d7, FIX_1_175875602); 
716 

717 
z3 += z5; 
718 
z4 += z5; 
719 

720 
tmp0 += z3; 
721 
tmp1 += z4; 
722 
tmp2 = z2 + z3; 
723 
tmp3 = z1 + z4; 
724 
} 
725 
} 
726 
} else {

727 
if (d3) {

728 
if (d1) {

729 
/* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */

730 
z1 = d7 + d1; 
731 
z3 = d7 + d3; 
732 
z5 = MULTIPLY(z3 + d1, FIX_1_175875602); 
733 

734 
tmp0 = MULTIPLY(d7, FIX_0_298631336); 
735 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
736 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
737 
z1 = MULTIPLY(z1, FIX_0_899976223); 
738 
z2 = MULTIPLY(d3, FIX_2_562915447); 
739 
z3 = MULTIPLY(z3, FIX_1_961570560); 
740 
z4 = MULTIPLY(d1, FIX_0_390180644); 
741 

742 
z3 += z5; 
743 
z4 += z5; 
744 

745 
tmp0 += z1 + z3; 
746 
tmp1 = z2 + z4; 
747 
tmp2 += z2 + z3; 
748 
tmp3 += z1 + z4; 
749 
} else {

750 
/* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */

751 
z3 = d7 + d3; 
752 

753 
tmp0 = MULTIPLY(d7, FIX_0_601344887); 
754 
z1 = MULTIPLY(d7, FIX_0_899976223); 
755 
tmp2 = MULTIPLY(d3, FIX_0_509795579); 
756 
z2 = MULTIPLY(d3, FIX_2_562915447); 
757 
z5 = MULTIPLY(z3, FIX_1_175875602); 
758 
z3 = MULTIPLY(z3, FIX_0_785694958); 
759 

760 
tmp0 += z3; 
761 
tmp1 = z2 + z5; 
762 
tmp2 += z3; 
763 
tmp3 = z1 + z5; 
764 
} 
765 
} else {

766 
if (d1) {

767 
/* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */

768 
z1 = d7 + d1; 
769 
z5 = MULTIPLY(z1, FIX_1_175875602); 
770  
771 
z1 = MULTIPLY(z1, FIX_0_275899380); 
772 
z3 = MULTIPLY(d7, FIX_1_961570560); 
773 
tmp0 = MULTIPLY(d7, FIX_1_662939225); 
774 
z4 = MULTIPLY(d1, FIX_0_390180644); 
775 
tmp3 = MULTIPLY(d1, FIX_1_111140466); 
776  
777 
tmp0 += z1; 
778 
tmp1 = z4 + z5; 
779 
tmp2 = z3 + z5; 
780 
tmp3 += z1; 
781 
} else {

782 
/* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */

783 
tmp0 = MULTIPLY(d7, FIX_1_387039845); 
784 
tmp1 = MULTIPLY(d7, FIX_1_175875602); 
785 
tmp2 = MULTIPLY(d7, FIX_0_785694958); 
786 
tmp3 = MULTIPLY(d7, FIX_0_275899380); 
787 
} 
788 
} 
789 
} 
790 
} else {

791 
if (d5) {

792 
if (d3) {

793 
if (d1) {

794 
/* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */

795 
z2 = d5 + d3; 
796 
z4 = d5 + d1; 
797 
z5 = MULTIPLY(d3 + z4, FIX_1_175875602); 
798 

799 
tmp1 = MULTIPLY(d5, FIX_2_053119869); 
800 
tmp2 = MULTIPLY(d3, FIX_3_072711026); 
801 
tmp3 = MULTIPLY(d1, FIX_1_501321110); 
802 
z1 = MULTIPLY(d1, FIX_0_899976223); 
803 
z2 = MULTIPLY(z2, FIX_2_562915447); 
804 
z3 = MULTIPLY(d3, FIX_1_961570560); 
805 
z4 = MULTIPLY(z4, FIX_0_390180644); 
806 

807 
z3 += z5; 
808 
z4 += z5; 
809 

810 
tmp0 = z1 + z3; 
811 
tmp1 += z2 + z4; 
812 
tmp2 += z2 + z3; 
813 
tmp3 += z1 + z4; 
814 
} else {

815 
/* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */

816 
z2 = d5 + d3; 
817 

818 
z5 = MULTIPLY(z2, FIX_1_175875602); 
819 
tmp1 = MULTIPLY(d5, FIX_1_662939225); 
820 
z4 = MULTIPLY(d5, FIX_0_390180644); 
821 
z2 = MULTIPLY(z2, FIX_1_387039845); 
822 
tmp2 = MULTIPLY(d3, FIX_1_111140466); 
823 
z3 = MULTIPLY(d3, FIX_1_961570560); 
824 

825 
tmp0 = z3 + z5; 
826 
tmp1 += z2; 
827 
tmp2 += z2; 
828 
tmp3 = z4 + z5; 
829 
} 
830 
} else {

831 
if (d1) {

832 
/* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */

833 
z4 = d5 + d1; 
834 

835 
z5 = MULTIPLY(z4, FIX_1_175875602); 
836 
z1 = MULTIPLY(d1, FIX_0_899976223); 
837 
tmp3 = MULTIPLY(d1, FIX_0_601344887); 
838 
tmp1 = MULTIPLY(d5, FIX_0_509795579); 
839 
z2 = MULTIPLY(d5, FIX_2_562915447); 
840 
z4 = MULTIPLY(z4, FIX_0_785694958); 
841 

842 
tmp0 = z1 + z5; 
843 
tmp1 += z4; 
844 
tmp2 = z2 + z5; 
845 
tmp3 += z4; 
846 
} else {

847 
/* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */

848 
tmp0 = MULTIPLY(d5, FIX_1_175875602); 
849 
tmp1 = MULTIPLY(d5, FIX_0_275899380); 
850 
tmp2 = MULTIPLY(d5, FIX_1_387039845); 
851 
tmp3 = MULTIPLY(d5, FIX_0_785694958); 
852 
} 
853 
} 
854 
} else {

855 
if (d3) {

856 
if (d1) {

857 
/* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */

858 
z5 = d1 + d3; 
859 
tmp3 = MULTIPLY(d1, FIX_0_211164243); 
860 
tmp2 = MULTIPLY(d3, FIX_1_451774981); 
861 
z1 = MULTIPLY(d1, FIX_1_061594337); 
862 
z2 = MULTIPLY(d3, FIX_2_172734803); 
863 
z4 = MULTIPLY(z5, FIX_0_785694958); 
864 
z5 = MULTIPLY(z5, FIX_1_175875602); 
865 

866 
tmp0 = z1  z4; 
867 
tmp1 = z2 + z4; 
868 
tmp2 += z5; 
869 
tmp3 += z5; 
870 
} else {

871 
/* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */

872 
tmp0 = MULTIPLY(d3, FIX_0_785694958); 
873 
tmp1 = MULTIPLY(d3, FIX_1_387039845); 
874 
tmp2 = MULTIPLY(d3, FIX_0_275899380); 
875 
tmp3 = MULTIPLY(d3, FIX_1_175875602); 
876 
} 
877 
} else {

878 
if (d1) {

879 
/* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */

880 
tmp0 = MULTIPLY(d1, FIX_0_275899380); 
881 
tmp1 = MULTIPLY(d1, FIX_0_785694958); 
882 
tmp2 = MULTIPLY(d1, FIX_1_175875602); 
883 
tmp3 = MULTIPLY(d1, FIX_1_387039845); 
884 
} else {

885 
/* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */

886 
tmp0 = tmp1 = tmp2 = tmp3 = 0;

887 
} 
888 
} 
889 
} 
890 
} 
891  
892 
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

893  
894 
dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,

895 
CONST_BITS+PASS1_BITS+3);

896 
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10  tmp3,

897 
CONST_BITS+PASS1_BITS+3);

898 
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,

899 
CONST_BITS+PASS1_BITS+3);

900 
dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11  tmp2,

901 
CONST_BITS+PASS1_BITS+3);

902 
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,

903 
CONST_BITS+PASS1_BITS+3);

904 
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12  tmp1,

905 
CONST_BITS+PASS1_BITS+3);

906 
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,

907 
CONST_BITS+PASS1_BITS+3);

908 
dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13  tmp0,

909 
CONST_BITS+PASS1_BITS+3);

910 

911 
dataptr++; /* advance pointer to next column */

912 
} 
913 
} 
914  
915 
#undef DCTSIZE

916 
#define DCTSIZE 4 
917 
#define DCTSTRIDE 8 
918  
919 
void j_rev_dct4(DCTBLOCK data)

920 
{ 
921 
int32_t tmp0, tmp1, tmp2, tmp3; 
922 
int32_t tmp10, tmp11, tmp12, tmp13; 
923 
int32_t z1; 
924 
int32_t d0, d2, d4, d6; 
925 
register DCTELEM *dataptr;

926 
int rowctr;

927  
928 
/* Pass 1: process rows. */

929 
/* Note results are scaled up by sqrt(8) compared to a true IDCT; */

930 
/* furthermore, we scale the results by 2**PASS1_BITS. */

931  
932 
data[0] += 4; 
933 

934 
dataptr = data; 
935  
936 
for (rowctr = DCTSIZE1; rowctr >= 0; rowctr) { 
937 
/* Due to quantization, we will usually find that many of the input

938 
* coefficients are zero, especially the AC terms. We can exploit this

939 
* by shortcircuiting the IDCT calculation for any row in which all

940 
* the AC terms are zero. In that case each output is equal to the

941 
* DC coefficient (with scale factor as needed).

942 
* With typical images and quantization tables, half or more of the

943 
* row DCT calculations can be simplified this way.

944 
*/

945  
946 
register int *idataptr = (int*)dataptr; 
947  
948 
d0 = dataptr[0];

949 
d2 = dataptr[1];

950 
d4 = dataptr[2];

951 
d6 = dataptr[3];

952  
953 
if ((d2  d4  d6) == 0) { 
954 
/* AC terms all zero */

955 
if (d0) {

956 
/* Compute a 32 bit value to assign. */

957 
DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS); 
958 
register int v = (dcval & 0xffff)  ((dcval << 16) & 0xffff0000); 
959 

960 
idataptr[0] = v;

961 
idataptr[1] = v;

962 
} 
963 

964 
dataptr += DCTSTRIDE; /* advance pointer to next row */

965 
continue;

966 
} 
967 

968 
/* Even part: reverse the even part of the forward DCT. */

969 
/* The rotator is sqrt(2)*c(6). */

970 
if (d6) {

971 
if (d2) {

972 
/* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */

973 
z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 
974 
tmp2 = z1 + MULTIPLY(d6, FIX_1_847759065); 
975 
tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 
976  
977 
tmp0 = (d0 + d4) << CONST_BITS; 
978 
tmp1 = (d0  d4) << CONST_BITS; 
979  
980 
tmp10 = tmp0 + tmp3; 
981 
tmp13 = tmp0  tmp3; 
982 
tmp11 = tmp1 + tmp2; 
983 
tmp12 = tmp1  tmp2; 
984 
} else {

985 
/* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */

986 
tmp2 = MULTIPLY(d6, FIX_1_306562965); 
987 
tmp3 = MULTIPLY(d6, FIX_0_541196100); 
988  
989 
tmp0 = (d0 + d4) << CONST_BITS; 
990 
tmp1 = (d0  d4) << CONST_BITS; 
991  
992 
tmp10 = tmp0 + tmp3; 
993 
tmp13 = tmp0  tmp3; 
994 
tmp11 = tmp1 + tmp2; 
995 
tmp12 = tmp1  tmp2; 
996 
} 
997 
} else {

998 
if (d2) {

999 
/* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */

1000 
tmp2 = MULTIPLY(d2, FIX_0_541196100); 
1001 
tmp3 = MULTIPLY(d2, FIX_1_306562965); 
1002  
1003 
tmp0 = (d0 + d4) << CONST_BITS; 
1004 
tmp1 = (d0  d4) << CONST_BITS; 
1005  
1006 
tmp10 = tmp0 + tmp3; 
1007 
tmp13 = tmp0  tmp3; 
1008 
tmp11 = tmp1 + tmp2; 
1009 
tmp12 = tmp1  tmp2; 
1010 
} else {

1011 
/* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */

1012 
tmp10 = tmp13 = (d0 + d4) << CONST_BITS; 
1013 
tmp11 = tmp12 = (d0  d4) << CONST_BITS; 
1014 
} 
1015 
} 
1016  
1017 
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

1018  
1019 
dataptr[0] = (DCTELEM) DESCALE(tmp10, CONST_BITSPASS1_BITS);

1020 
dataptr[1] = (DCTELEM) DESCALE(tmp11, CONST_BITSPASS1_BITS);

1021 
dataptr[2] = (DCTELEM) DESCALE(tmp12, CONST_BITSPASS1_BITS);

1022 
dataptr[3] = (DCTELEM) DESCALE(tmp13, CONST_BITSPASS1_BITS);

1023  
1024 
dataptr += DCTSTRIDE; /* advance pointer to next row */

1025 
} 
1026  
1027 
/* Pass 2: process columns. */

1028 
/* Note that we must descale the results by a factor of 8 == 2**3, */

1029 
/* and also undo the PASS1_BITS scaling. */

1030  
1031 
dataptr = data; 
1032 
for (rowctr = DCTSIZE1; rowctr >= 0; rowctr) { 
1033 
/* Columns of zeroes can be exploited in the same way as we did with rows.

1034 
* However, the row calculation has created many nonzero AC terms, so the

1035 
* simplification applies less often (typically 5% to 10% of the time).

1036 
* On machines with very fast multiplication, it's possible that the

1037 
* test takes more time than it's worth. In that case this section

1038 
* may be commented out.

1039 
*/

1040  
1041 
d0 = dataptr[DCTSTRIDE*0];

1042 
d2 = dataptr[DCTSTRIDE*1];

1043 
d4 = dataptr[DCTSTRIDE*2];

1044 
d6 = dataptr[DCTSTRIDE*3];

1045  
1046 
/* Even part: reverse the even part of the forward DCT. */

1047 
/* The rotator is sqrt(2)*c(6). */

1048 
if (d6) {

1049 
if (d2) {

1050 
/* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */

1051 
z1 = MULTIPLY(d2 + d6, FIX_0_541196100); 
1052 
tmp2 = z1 + MULTIPLY(d6, FIX_1_847759065); 
1053 
tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); 
1054  
1055 
tmp0 = (d0 + d4) << CONST_BITS; 
1056 
tmp1 = (d0  d4) << CONST_BITS; 
1057  
1058 
tmp10 = tmp0 + tmp3; 
1059 
tmp13 = tmp0  tmp3; 
1060 
tmp11 = tmp1 + tmp2; 
1061 
tmp12 = tmp1  tmp2; 
1062 
} else {

1063 
/* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */

1064 
tmp2 = MULTIPLY(d6, FIX_1_306562965); 
1065 
tmp3 = MULTIPLY(d6, FIX_0_541196100); 
1066  
1067 
tmp0 = (d0 + d4) << CONST_BITS; 
1068 
tmp1 = (d0  d4) << CONST_BITS; 
1069  
1070 
tmp10 = tmp0 + tmp3; 
1071 
tmp13 = tmp0  tmp3; 
1072 
tmp11 = tmp1 + tmp2; 
1073 
tmp12 = tmp1  tmp2; 
1074 
} 
1075 
} else {

1076 
if (d2) {

1077 
/* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */

1078 
tmp2 = MULTIPLY(d2, FIX_0_541196100); 
1079 
tmp3 = MULTIPLY(d2, FIX_1_306562965); 
1080  
1081 
tmp0 = (d0 + d4) << CONST_BITS; 
1082 
tmp1 = (d0  d4) << CONST_BITS; 
1083  
1084 
tmp10 = tmp0 + tmp3; 
1085 
tmp13 = tmp0  tmp3; 
1086 
tmp11 = tmp1 + tmp2; 
1087 
tmp12 = tmp1  tmp2; 
1088 
} else {

1089 
/* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */

1090 
tmp10 = tmp13 = (d0 + d4) << CONST_BITS; 
1091 
tmp11 = tmp12 = (d0  d4) << CONST_BITS; 
1092 
} 
1093 
} 
1094  
1095 
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

1096  
1097 
dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3); 
1098 
dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3); 
1099 
dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3); 
1100 
dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3); 
1101 

1102 
dataptr++; /* advance pointer to next column */

1103 
} 
1104 
} 
1105  
1106 
void j_rev_dct2(DCTBLOCK data){

1107 
int d00, d01, d10, d11;

1108  
1109 
data[0] += 4; 
1110 
d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; 
1111 
d01 = data[0+0*DCTSTRIDE]  data[1+0*DCTSTRIDE]; 
1112 
d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; 
1113 
d11 = data[0+1*DCTSTRIDE]  data[1+1*DCTSTRIDE]; 
1114 

1115 
data[0+0*DCTSTRIDE]= (d00 + d10)>>3; 
1116 
data[1+0*DCTSTRIDE]= (d01 + d11)>>3; 
1117 
data[0+1*DCTSTRIDE]= (d00  d10)>>3; 
1118 
data[1+1*DCTSTRIDE]= (d01  d11)>>3; 
1119 
} 
1120  
1121 
void j_rev_dct1(DCTBLOCK data){

1122 
data[0] = (data[0] + 4)>>3; 
1123 
} 
1124  
1125 
#undef FIX

1126 
#undef CONST_BITS
