;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

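; Note (added): the seven rows above appear to be the usual VP3/Theora IDCT
; cosine constants, C(k) = round(65536 * cos(k*pi/16)) for k = 1..7, each
; replicated 8 times so a single pmulhw applies the same constant to a whole
; register of coefficients.  Worked examples:
;   cos(1*pi/16) = 0.980785...,  0.980785 * 65536 = 64277 (rounded)
;   cos(7*pi/16) = 0.195090...,  0.195090 * 65536 = 12785 (rounded)
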
cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
cextern pb_81

cextern pw_8

cextern put_signed_pixels_clamped_mmx
cextern add_pixels_clamped_mmx

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7
    psubusb       m7, [pb_81]

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

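; Note (added): rough C equivalent of VP3_LOOP_FILTER, as a sketch only (the
; MMX code computes the same thing bytewise with pavgb/psubusb tricks, which
; is where the off-by-one/two for large filter limits comes from).  The qword
; at [r2+516] is assumed here to hold 2*filter_limit replicated into each
; byte; lim below is filter_limit.
;
;   int d = (p0 - p3 + 3 * (p2 - p1) + 4) >> 3;             /* signed delta */
;   int a = abs(d);
;   int f = a < lim ? a : (a < 2 * lim ? 2 * lim - a : 0);  /* bounding fn  */
;   if (d < 0) f = -f;
;   p1 = av_clip_uint8(p1 + f);                             /* new p1 (mm4) */
;   p2 = av_clip_uint8(p2 - f);                             /* new p2 (mm3) */
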
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

INIT_MMX
cglobal vp3_v_loop_filter_mmx2, 3, 4
%ifdef ARCH_X86_64
    movsxd        r1, r1d
%endif
    mov           r3, r1
    neg           r1
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

cglobal vp3_h_loop_filter_mmx2, 3, 4
%ifdef ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

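; Note (added): the horizontal filter reuses the vertical kernel by
; transposing.  Each movd/punpcklbw pair above gathers the 4 bytes at
; x-2..x+1 from two rows, so after TRANSPOSE4x4B the registers hold
; p0/p1/p2/p3 as rows of 8 pixels, exactly like the vertical case.  After
; filtering, SBUTTERFLY re-interleaves the two modified columns (p1, p2)
; into byte pairs and STORE_4_WORDS scatters one 16-bit pair per row back
; at x-1.
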
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro

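; Note (added): pmulhw computes a signed 16x16 -> high-16 product.  The
; constants c1..c5 above are >= 32768, so stored in a word they read as
; c - 65536 (e.g. C(1) = 64277 = 0xFB15, which pmulhw treats as -1259), and
; pmulhw alone yields (x*c >> 16) - x.  The paddw of the original input that
; follows each such pmulhw restores the intended product:
;   pmulhw: t = (x * (c - 65536)) >> 16 = (x*c >> 16) - x
;   paddw:  t + x                       =  x*c >> 16
; hence the paired "c3*i3 - i3" / "c3*i3" comments.  c6 and c7 fit in a
; signed word, so their products are used directly.  The SSE2 version below
; follows the same convention.
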
; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
;  I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
;  J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
;
;  Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

%macro VP3_IDCT_mmx 1
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16* x     ]
%define J(x) [%1+16*(x-4)+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    RowIDCT
    Transpose

%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    ColumnIDCT

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    ColumnIDCT
%endmacro

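; Note (added): each movq holds 4 of the 16-bit coefficients, so every
; RowIDCT/ColumnIDCT invocation performs four 8-point 1-D IDCTs at once.
; With a 128-byte block of words at %1, the first two RowIDCT+Transpose
; passes cover the two halves of the (already partially transposed, see the
; comment above) input via the I/J offsets +0/+8 and +64/+72; the two
; ColumnIDCT passes then run on offsets +0 and +8.  Only ColumnIDCT rounds
; (+8 via OC_8) and shifts by 4, which is the final output normalization.
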
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = i2 * c2 - i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = i2 * c2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; Save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = C4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = (c4 - 1) * (i0 - i4) = F
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 )
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; Load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; Adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; Load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; Adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; Adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; Adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

%macro VP3_IDCT_sse2 1
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
%ifdef ARCH_X86_64
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro

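; Note (added): the SSE2 variant does the whole 8x8 block in two 1-D passes.
; In the first VP3_1D_IDCT_SSE2 call ADD()/SHIFT() are defined empty, so the
; rows are transformed without normalization; TRANSPOSE8x8W then swaps rows
; and columns (using xmm8 as scratch on x86-64, or two 16-byte spill slots
; inside the block on x86-32).  The second call redefines ADD(x) as
; paddsw x, [pw_8] and SHIFT(x) as psraw x, 4, giving the rounded
; (x + 8) >> 4 result that PUT_BLOCK writes back over the coefficients.
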
%macro vp3_idct_funcs 3
cglobal vp3_idct_%1, 1, 1, %2
    VP3_IDCT_%1   r0
    RET

cglobal vp3_idct_put_%1, 3, %3, %2
    VP3_IDCT_%1   r2
%ifdef ARCH_X86_64
    mov           r3, r2
    mov           r2, r1
    mov           r1, r0
    mov           r0, r3
%else
    mov          r0m, r2
    mov          r1m, r0
    mov          r2m, r1
%endif
%ifdef WIN64
    call put_signed_pixels_clamped_mmx
    RET
%else
    jmp put_signed_pixels_clamped_mmx
%endif

cglobal vp3_idct_add_%1, 3, %3, %2
    VP3_IDCT_%1   r2
%ifdef ARCH_X86_64
    mov           r3, r2
    mov           r2, r1
    mov           r1, r0
    mov           r0, r3
%else
    mov          r0m, r2
    mov          r1m, r0
    mov          r2m, r1
%endif
%ifdef WIN64
    call add_pixels_clamped_mmx
    RET
%else
    jmp add_pixels_clamped_mmx
%endif
%endmacro

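; Note (added): vp3_idct_put/add take (dest, line_size, block) and run the
; IDCT in place on the block (r2).  The clamped copy/add helpers are assumed
; here to take (block, dest, line_size), so the register moves above rotate
; r0/r1/r2 accordingly (or rewrite the stack argument slots on x86-32) before
; tail-jumping into them; on WIN64 a real call + RET is used instead,
; presumably so that cglobal's own epilogue still runs before returning.
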
%ifdef ARCH_X86_64
%define REGS 4
%else
%define REGS 3
%endif
INIT_MMX
vp3_idct_funcs mmx,  0, REGS
INIT_XMM
vp3_idct_funcs sse2, 9, REGS
%undef REGS

%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r3  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r3  ], m5
%endmacro

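; Note (added): rough C equivalent of vp3_idct_dc_add below (sketch).  Only
; the DC coefficient of the block is used; the two DC_ADD invocations cover
; rows 0-3 and 4-7.  Splitting the DC value into a non-negative part in m0
; and a non-positive part in m1 lets paddusb/psubusb apply it with unsigned
; saturation (one of the two is always zero).
;
;   int dc = (block[0] + 15) >> 5;
;   for (int y = 0; y < 8; y++)
;       for (int x = 0; x < 8; x++)
;           dest[y*line_size + x] = av_clip_uint8(dest[y*line_size + x] + dc);
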
INIT_MMX
cglobal vp3_idct_dc_add_mmx2, 3, 4
%ifdef ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]
    movsx         r2, word [r2]
    add           r2, 15
    sar           r2, 5
    movd          m0, r2d
    pshufw        m0, m0, 0x0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0
    packuswb      m1, m1
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET