Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / vp3dsp_mmx.c @ 699b3f99

History | View | Annotate | Download (11.7 KB)

1
/*
2
 * Copyright (C) 2004 the ffmpeg project
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
/**
22
 * @file vp3dsp_mmx.c
23
 * MMX-optimized functions cribbed from the original VP3 source code.
24
 */
25

    
26
#include "dsputil.h"
27
#include "mmx.h"
28

    
29
#define IdctAdjustBeforeShift 8
30

    
31
/* (12 * 4) 2-byte memory locations ( = 96 bytes total)
32
 * idct_constants[0..15] = Mask table (M(I))
33
 * idct_constants[16..43] = Cosine table (C(I))
34
 * idct_constants[44..47] = 8
35
 */
36
static uint16_t idct_constants[(4 + 7 + 1) * 4];
37
static const uint16_t idct_cosine_table[7] = {
38
    64277, 60547, 54491, 46341, 36410, 25080, 12785
39
};
40

    
41
#define r0 mm0
42
#define r1 mm1
43
#define r2 mm2
44
#define r3 mm3
45
#define r4 mm4
46
#define r5 mm5
47
#define r6 mm6
48
#define r7 mm7
49

    
50
/* from original comments: The Macro does IDct on 4 1-D Dcts */
51
#define BeginIDCT() { \
52
    movq_m2r(*I(3), r2); \
53
    movq_m2r(*C(3), r6); \
54
    movq_r2r(r2, r4); \
55
    movq_m2r(*J(5), r7); \
56
    pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
57
    movq_m2r(*C(5), r1); \
58
    pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
59
    movq_r2r(r1, r5); \
60
    pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
61
    movq_m2r(*I(1), r3); \
62
    pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
63
    movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
64
    paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
65
    paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
66
    paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
67
    movq_m2r(*J(7), r1); \
68
    paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
69
    movq_r2r(r0, r5);         /* r5 = c1 */ \
70
    pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
71
    paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
72
    pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
73
    movq_m2r(*C(7), r7); \
74
    psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
75
    paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
76
    pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
77
    movq_m2r(*I(2), r2); \
78
    pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
79
    paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
80
    movq_r2r(r2, r1);         /* r1 = i2 */ \
81
    pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
82
    psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
83
    movq_m2r(*J(6), r5); \
84
    paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
85
    movq_r2r(r5, r7);         /* r7 = i6 */ \
86
    psubsw_r2r(r4, r0);       /* r0 = A - C */ \
87
    pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
88
    paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
89
    pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
90
    paddsw_r2r(r4, r4);       /* r4 = C + C */ \
91
    paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
92
    psubsw_r2r(r6, r3);       /* r3 = B - D */ \
93
    paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
94
    paddsw_r2r(r6, r6);       /* r6 = D + D */ \
95
    pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
96
    paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
97
    movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
98
    psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
99
    movq_m2r(*C(4), r4); \
100
    movq_r2r(r3, r5);         /* r5 = B - D */ \
101
    pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
102
    paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
103
    movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
104
    movq_r2r(r0, r2);         /* r2 = A - C */ \
105
    movq_m2r(*I(0), r6); \
106
    pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
107
    paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
108
    movq_m2r(*J(4), r3); \
109
    psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
110
    paddw_r2r(r0, r2);        /* r0 = A. = c4 * (A - C) */ \
111
    psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
112
    movq_r2r(r6, r0); \
113
    pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
114
    paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
115
    paddsw_r2r(r1, r1);       /* r1 = H + H */ \
116
    paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
117
    paddsw_r2r(r5, r1);       /* r1 = H. = B + H */ \
118
    pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
119
    paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
120
    psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
121
    paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
122
    movq_m2r(*I(1), r0);      /* r0 = C. */ \
123
    paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
124
    paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
125
    psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
126
}
127

    
128
/* RowIDCT gets ready to transpose */
129
#define RowIDCT() { \
130
    \
131
    BeginIDCT(); \
132
    \
133
    movq_m2r(*I(2), r3);   /* r3 = D. */ \
134
    psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
135
    paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
136
    paddsw_r2r(r7, r7);    /* r7 = G + G */ \
137
    paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
138
    paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
139
    psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
140
    paddsw_r2r(r3, r3); \
141
    psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
142
    paddsw_r2r(r5, r5); \
143
    paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
144
    paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
145
    psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
146
    paddsw_r2r(r0, r0); \
147
    movq_r2m(r1, *I(1));   /* save R1 */ \
148
    paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
149
}
150

    
151
/* Column IDCT normalizes and stores final results */
152
#define ColumnIDCT() { \
153
    \
154
    BeginIDCT(); \
155
    \
156
    paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
157
    paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
158
    paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
159
    psraw_i2r(4, r2);          /* r2 = NR2 */ \
160
    psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
161
    psraw_i2r(4, r1);          /* r1 = NR1 */ \
162
    movq_m2r(*I(2), r3);       /* r3 = D. */ \
163
    paddsw_r2r(r7, r7);        /* r7 = G + G */ \
164
    movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
165
    paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
166
    movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
167
    psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
168
    paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
169
    paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
170
    paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
171
    psraw_i2r(4, r4);          /* r4 = NR4 */ \
172
    psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
173
    psraw_i2r(4, r3);          /* r3 = NR3 */ \
174
    paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
175
    paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
176
    paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
177
    psraw_i2r(4, r6);          /* r6 = NR6 */ \
178
    movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
179
    psraw_i2r(4, r5);          /* r5 = NR5 */ \
180
    movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
181
    psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
182
    paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
183
    paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
184
    paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
185
    psraw_i2r(4, r7);          /* r7 = NR7 */ \
186
    movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
187
    psraw_i2r(4, r0);          /* r0 = NR0 */ \
188
    movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
189
    movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
190
    movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
191
}
192

    
193
/* Following macro does two 4x4 transposes in place.
194

195
  At entry (we assume):
196

197
    r0 = a3 a2 a1 a0
198
    I(1) = b3 b2 b1 b0
199
    r2 = c3 c2 c1 c0
200
    r3 = d3 d2 d1 d0
201

202
    r4 = e3 e2 e1 e0
203
    r5 = f3 f2 f1 f0
204
    r6 = g3 g2 g1 g0
205
    r7 = h3 h2 h1 h0
206

207
  At exit, we have:
208

209
    I(0) = d0 c0 b0 a0
210
    I(1) = d1 c1 b1 a1
211
    I(2) = d2 c2 b2 a2
212
    I(3) = d3 c3 b3 a3
213

214
    J(4) = h0 g0 f0 e0
215
    J(5) = h1 g1 f1 e1
216
    J(6) = h2 g2 f2 e2
217
    J(7) = h3 g3 f3 e3
218

219
   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
220
   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
221

222
   Since r1 is free at entry, we calculate the Js first. */
223

    
224
#define Transpose() { \
225
    movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
226
    punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
227
    movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
228
    punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
229
    movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
230
    punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
231
    movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
232
    punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
233
    punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
234
    movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
235
    movq_r2m(r4, *J(4)); \
236
    punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
237
    movq_r2m(r5, *J(5)); \
238
    punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
239
    movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
240
    punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
241
    movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
242
    movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
243
    movq_r2m(r6, *J(7)); \
244
    punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
245
    movq_r2m(r1, *J(6)); \
246
    punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
247
    movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
248
    punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
249
    movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
250
    punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
251
    punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
252
    movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
253
    movq_r2m(r0, *I(0)); \
254
    punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
255
    movq_r2m(r1, *I(1)); \
256
    punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
257
    punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
258
    movq_r2m(r4, *I(3)); \
259
    movq_r2m(r2, *I(2)); \
260
}
261

    
262
void ff_vp3_dsp_init_mmx(void)
263
{
264
    int j = 16;
265
    uint16_t *p;
266

    
267
    j = 1;
268
    do {
269
        p = idct_constants + ((j + 3) << 2);
270
        p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
271
    } while (++j <= 7);
272

    
273
    idct_constants[44] = idct_constants[45] =
274
    idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
275
}
276

    
277
void ff_vp3_idct_mmx(int16_t *output_data)
278
{
279
    /* eax = quantized input
280
     * ebx = dequantizer matrix
281
     * ecx = IDCT constants
282
     *  M(I) = ecx + MaskOffset(0) + I * 8
283
     *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
284
     * edx = output
285
     * r0..r7 = mm0..mm7
286
     */
287

    
288
#define C(x) (idct_constants + 16 + (x - 1) * 4)
289
#define Eight (idct_constants + 44)
290

    
291
    /* at this point, function has completed dequantization + dezigzag +
292
     * partial transposition; now do the idct itself */
293
#define I(K) (output_data + K * 8)
294
#define J(K) (output_data + ((K - 4) * 8) + 4)
295

    
296
    RowIDCT();
297
    Transpose();
298

    
299
#undef I
300
#undef J
301
#define I(K) (output_data + (K * 8) + 32)
302
#define J(K) (output_data + ((K - 4) * 8) + 36)
303

    
304
    RowIDCT();
305
    Transpose();
306

    
307
#undef I
308
#undef J
309
#define I(K) (output_data + K * 8)
310
#define J(K) (output_data + K * 8)
311

    
312
    ColumnIDCT();
313

    
314
#undef I
315
#undef J
316
#define I(K) (output_data + (K * 8) + 4)
317
#define J(K) (output_data + (K * 8) + 4)
318

    
319
    ColumnIDCT();
320

    
321
#undef I
322
#undef J
323

    
324
}
325

    
326
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
327
{
328
    ff_vp3_idct_mmx(block);
329
    put_signed_pixels_clamped_mmx(block, dest, line_size);
330
}
331

    
332
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
333
{
334
    ff_vp3_idct_mmx(block);
335
    add_pixels_clamped_mmx(block, dest, line_size);
336
}