;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif
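; Each scan8 entry is x + y*8: the position of a 4x4 block in the 8-wide
; non-zero-count grid the decoder keeps per macroblock (16 luma blocks
; first, then the chroma blocks), so nnzc[scan8[i]] below tells whether
; block i has any coefficients. With PIC the table address lives in r11.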

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      0, 1, 2, 3, 4, 5
    pxor         m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
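; Standard row/column decomposition: one IDCT4_1D pass over the rows, a
; transpose, then a second pass over the columns. The [pw_32] bias added
; between the passes provides the final rounding, since STORE_DIFFx2
; shifts the result right by 6 before adding it to the destination
; pixels, i.e. dst += (idct(block) + 32) >> 6.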
||
73 | |||
74 | INIT_MMX |
||
75 | ; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) |
||
76 | cglobal h264_idct_add_mmx, 3, 3, 0 |
||
77 | IDCT4_ADD r0, r1, r2 |
||
78 | RET |
||
79 | |||
80 | %macro IDCT8_1D 2 |
||
81 | mova m4, m5 |
||
82 | mova m0, m1 |
||
83 | psraw m4, 1 |
||
84 | psraw m1, 1 |
||
85 | paddw m4, m5 |
||
86 | paddw m1, m0 |
||
87 | paddw m4, m7 |
||
88 | paddw m1, m5 |
||
89 | psubw m4, m0 |
||
90 | paddw m1, m3 |
||
91 | |||
92 | psubw m0, m3 |
||
93 | psubw m5, m3 |
||
94 | paddw m0, m7 |
||
95 | psubw m5, m7 |
||
96 | psraw m3, 1 |
||
97 | psraw m7, 1 |
||
98 | psubw m0, m3 |
||
99 | psubw m5, m7 |
||
100 | |||
101 | mova m3, m4 |
||
102 | mova m7, m1 |
||
103 | psraw m1, 2 |
||
104 | psraw m3, 2 |
||
105 | paddw m3, m0 |
||
106 | psraw m0, 2 |
||
107 | paddw m1, m5 |
||
108 | psraw m5, 2 |
||
109 | psubw m0, m4 |
||
110 | psubw m7, m5 |
||
111 | |||
112 | mova m4, m2 |
||
113 | mova m5, m6 |
||
114 | psraw m4, 1 |
||
115 | psraw m6, 1 |
||
116 | psubw m4, m5 |
||
117 | paddw m6, m2 |
||
118 | |||
119 | mova m2, %1 |
||
120 | mova m5, %2 |
||
121 | SUMSUB_BA m5, m2 |
||
122 | SUMSUB_BA m6, m5 |
||
123 | SUMSUB_BA m4, m2 |
||
124 | SUMSUB_BA m7, m6 |
||
125 | SUMSUB_BA m0, m4 |
||
126 | SUMSUB_BA m3, m2 |
||
127 | SUMSUB_BA m1, m5 |
||
128 | SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 |
||
129 | %endmacro |
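; 8-point 1D transform as specified for the H.264 8x8 IDCT: the first
; three groups above compute the odd-sample half (rows 1, 3, 5, 7), the
; fourth the even half (rows 2 and 6), and the SUMSUB_BA butterflies then
; fold in rows 0 and 4, which are passed as memory operands %1/%2 because
; all eight mmx/xmm registers are already occupied.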
||
130 | |||
131 | %macro IDCT8_1D_FULL 1 |
||
132 | mova m7, [%1+112] |
||
133 | mova m6, [%1+ 96] |
||
134 | mova m5, [%1+ 80] |
||
135 | mova m3, [%1+ 48] |
||
136 | mova m2, [%1+ 32] |
||
137 | mova m1, [%1+ 16] |
||
138 | IDCT8_1D [%1], [%1+ 64] |
||
139 | %endmacro |
||
140 | |||
141 | ; %1=int16_t *block, %2=int16_t *dstblock |
||
142 | %macro IDCT8_ADD_MMX_START 2 |
||
143 | IDCT8_1D_FULL %1 |
||
144 | mova [%1], m7 |
||
145 | TRANSPOSE4x4W 0, 1, 2, 3, 7 |
||
146 | mova m7, [%1] |
||
147 | mova [%2 ], m0 |
||
148 | mova [%2+16], m1 |
||
149 | mova [%2+32], m2 |
||
150 | mova [%2+48], m3 |
||
151 | TRANSPOSE4x4W 4, 5, 6, 7, 3 |
||
152 | mova [%2+ 8], m4 |
||
153 | mova [%2+24], m5 |
||
154 | mova [%2+40], m6 |
||
155 | mova [%2+56], m7 |
||
156 | %endmacro |
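; With mmx an 8x8 transpose does not fit in registers, so the 8x8 IDCT is
; split into two 8x4 halves: IDCT8_ADD_MMX_START runs the row pass on one
; half and transposes it out to a temporary buffer as two 4x4 tiles (m7 is
; spilled to [%1] so it can serve as transpose scratch), and
; IDCT8_ADD_MMX_END below runs the column pass from that buffer and adds
; the result to the destination.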
||
157 | |||
158 | ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
||
159 | %macro IDCT8_ADD_MMX_END 3 |
||
160 | IDCT8_1D_FULL %2 |
||
161 | mova [%2 ], m5 |
||
162 | mova [%2+16], m6 |
||
163 | mova [%2+32], m7 |
||
164 | |||
165 | pxor m7, m7 |
||
166 | STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 |
||
167 | lea %1, [%1+%3*2] |
||
168 | STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 |
||
169 | mova m0, [%2 ] |
||
170 | mova m1, [%2+16] |
||
171 | mova m2, [%2+32] |
||
172 | lea %1, [%1+%3*2] |
||
173 | STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3 |
||
174 | lea %1, [%1+%3*2] |
||
175 | STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 |
||
176 | %endmacro |
||
177 | |||
178 | INIT_MMX |
||
179 | ; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) |
||
180 | cglobal h264_idct8_add_mmx, 3, 4, 0 |
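    ; pad = 128 bytes of transpose scratch plus a fixup so rsp ends up
    ; 8-byte aligned (stack_offset is x86inc's record of how far the
    ; prologue has already moved the stack)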
%assign pad 128+4-(stack_offset&7)
    SUB          rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD          rsp, pad
    RET
||
193 | |||
194 | ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
||
195 | %macro IDCT8_ADD_SSE 4 |
||
196 | IDCT8_1D_FULL %2 |
||
197 | %ifdef ARCH_X86_64 |
||
198 | TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
||
199 | %else |
||
200 | TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16] |
||
201 | %endif |
||
202 | paddw m0, [pw_32] |
||
203 | |||
204 | %ifndef ARCH_X86_64 |
||
205 | mova [%2 ], m0 |
||
206 | mova [%2+16], m4 |
||
207 | IDCT8_1D [%2], [%2+ 16] |
||
208 | mova [%2 ], m6 |
||
209 | mova [%2+16], m7 |
||
210 | %else |
||
211 | SWAP 0, 8 |
||
212 | SWAP 4, 9 |
||
213 | IDCT8_1D m8, m9 |
||
214 | SWAP 6, 8 |
||
215 | SWAP 7, 9 |
||
216 | %endif |
||
217 | |||
218 | pxor m7, m7 |
||
219 | lea %4, [%3*3] |
||
220 | STORE_DIFF m0, m6, m7, [%1 ] |
||
221 | STORE_DIFF m1, m6, m7, [%1+%3 ] |
||
222 | STORE_DIFF m2, m6, m7, [%1+%3*2] |
||
223 | STORE_DIFF m3, m6, m7, [%1+%4 ] |
||
224 | %ifndef ARCH_X86_64 |
||
225 | mova m0, [%2 ] |
||
226 | mova m1, [%2+16] |
||
227 | %else |
||
228 | SWAP 0, 8 |
||
229 | SWAP 1, 9 |
||
230 | %endif |
||
231 | lea %1, [%1+%3*4] |
||
232 | STORE_DIFF m4, m6, m7, [%1 ] |
||
233 | STORE_DIFF m5, m6, m7, [%1+%3 ] |
||
234 | STORE_DIFF m0, m6, m7, [%1+%3*2] |
||
235 | STORE_DIFF m1, m6, m7, [%1+%4 ] |
||
236 | %endmacro |
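; SSE2 keeps the whole 8x8 block in registers: on x86-64 rows 0 and 4 are
; parked in xmm8/xmm9 (hence the 10-register count on the cglobal lines
; below), while x86-32, which only has xmm0-xmm7, spills them into the
; coefficient buffer instead.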
||
237 | |||
238 | INIT_XMM |
||
239 | ; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) |
||
240 | cglobal h264_idct8_add_sse2, 3, 4, 10 |
||
241 | IDCT8_ADD_SSE r0, r1, r2, r3 |
||
242 | RET |
||
243 | |||
244 | %macro DC_ADD_MMX2_INIT 2-3 |
||
245 | %if %0 == 2 |
||
246 | movsx %1, word [%1] |
||
247 | add %1, 32 |
||
248 | sar %1, 6 |
||
249 | 02b424d9 | Reimar Döffinger | movd m0, %1d |
250 | 1d16a1cf | Ronald S. Bultje | lea %1, [%2*3] |
251 | %else |
||
252 | add %3, 32 |
||
253 | sar %3, 6 |
||
254 | 02b424d9 | Reimar Döffinger | movd m0, %3d |
255 | 1d16a1cf | Ronald S. Bultje | lea %3, [%2*3] |
256 | %endif |
||
257 | pshufw m0, m0, 0 |
||
258 | pxor m1, m1 |
||
259 | psubw m1, m0 |
||
260 | packuswb m0, m0 |
||
261 | packuswb m1, m1 |
||
262 | %endmacro |
||
263 | |||
264 | %macro DC_ADD_MMX2_OP 3-4 |
||
265 | %1 m2, [%2 ] |
||
266 | %1 m3, [%2+%3 ] |
||
267 | %1 m4, [%2+%3*2] |
||
268 | %1 m5, [%2+%4 ] |
||
269 | paddusb m2, m0 |
||
270 | paddusb m3, m0 |
||
271 | paddusb m4, m0 |
||
272 | paddusb m5, m0 |
||
273 | psubusb m2, m1 |
||
274 | psubusb m3, m1 |
||
275 | psubusb m4, m1 |
||
276 | psubusb m5, m1 |
||
277 | %1 [%2 ], m2 |
||
278 | %1 [%2+%3 ], m3 |
||
279 | %1 [%2+%3*2], m4 |
||
280 | %1 [%2+%4 ], m5 |
||
281 | %endmacro |
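; DC-only blocks short-circuit the full transform: the result is just
; dc = (block[0] + 32) >> 6 added to every pixel. INIT broadcasts dc as
; bytes into m0 and -dc into m1; packuswb zero-saturates whichever of the
; two is negative, so the paddusb/psubusb pair in OP adds a positive dc
; (m1 = 0) or subtracts a negative one (m0 = 0), with clamping for free.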
||
282 | |||
283 | INIT_MMX |
||
284 | ; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) |
||
285 | cglobal h264_idct_dc_add_mmx2, 3, 3, 0 |
||
286 | DC_ADD_MMX2_INIT r1, r2 |
||
287 | DC_ADD_MMX2_OP movh, r0, r2, r1 |
||
288 | RET |
||
289 | |||
290 | ; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) |
||
291 | cglobal h264_idct8_dc_add_mmx2, 3, 3, 0 |
||
292 | DC_ADD_MMX2_INIT r1, r2 |
||
293 | DC_ADD_MMX2_OP mova, r0, r2, r1 |
||
294 | lea r0, [r0+r2*4] |
||
295 | DC_ADD_MMX2_OP mova, r0, r2, r1 |
||
296 | RET |
||
297 | |||
298 | ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, |
||
299 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
300 | cglobal h264_idct_add16_mmx, 5, 7, 0 |
||
301 | xor r5, r5 |
||
302 | %ifdef PIC |
||
303 | lea r11, [scan8_mem] |
||
304 | %endif |
||
305 | .nextblock |
||
306 | movzx r6, byte [scan8+r5] |
||
307 | movzx r6, byte [r4+r6] |
||
308 | test r6, r6 |
||
309 | jz .skipblock |
||
310 | mov r6d, dword [r1+r5*4] |
||
311 | lea r6, [r0+r6] |
||
312 | IDCT4_ADD r6, r2, r3 |
||
313 | .skipblock |
||
314 | inc r5 |
||
315 | add r2, 32 |
||
316 | cmp r5, 16 |
||
317 | jl .nextblock |
||
318 | REP_RET |
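; All the add16/add8 loops below share this pattern: r5 indexes the 4x4
; blocks, scan8 maps the block index to its slot in the nnzc[] table, and
; blocks with a zero non-zero count are skipped; block_offset[r5] then
; gives the block's pixel offset inside dst.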
||
319 | |||
320 | ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, |
||
321 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
322 | cglobal h264_idct8_add4_mmx, 5, 7, 0 |
||
323 | %assign pad 128+4-(stack_offset&7) |
||
324 | SUB rsp, pad |
||
325 | |||
326 | xor r5, r5 |
||
327 | %ifdef PIC |
||
328 | lea r11, [scan8_mem] |
||
329 | %endif |
||
330 | .nextblock |
||
331 | movzx r6, byte [scan8+r5] |
||
332 | movzx r6, byte [r4+r6] |
||
333 | test r6, r6 |
||
334 | jz .skipblock |
||
335 | mov r6d, dword [r1+r5*4] |
||
336 | lea r6, [r0+r6] |
||
337 | add word [r2], 32 |
||
338 | IDCT8_ADD_MMX_START r2 , rsp |
||
339 | IDCT8_ADD_MMX_START r2+8, rsp+64 |
||
340 | IDCT8_ADD_MMX_END r6 , rsp, r3 |
||
341 | mov r6d, dword [r1+r5*4] |
||
342 | lea r6, [r0+r6+4] |
||
343 | IDCT8_ADD_MMX_END r6 , rsp+8, r3 |
||
344 | .skipblock |
||
345 | add r5, 4 |
||
346 | add r2, 128 |
||
347 | cmp r5, 16 |
||
348 | jl .nextblock |
||
349 | ADD rsp, pad |
||
350 | RET |
||
351 | |||
352 | ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, |
||
353 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
354 | cglobal h264_idct_add16_mmx2, 5, 7, 0 |
||
355 | xor r5, r5 |
||
356 | %ifdef PIC |
||
357 | lea r11, [scan8_mem] |
||
358 | %endif |
||
359 | .nextblock |
||
360 | movzx r6, byte [scan8+r5] |
||
361 | movzx r6, byte [r4+r6] |
||
362 | test r6, r6 |
||
363 | jz .skipblock |
||
364 | cmp r6, 1 |
||
365 | jnz .no_dc |
||
366 | movsx r6, word [r2] |
||
367 | test r6, r6 |
||
368 | jz .no_dc |
||
369 | DC_ADD_MMX2_INIT r2, r3, r6 |
||
370 | %ifdef ARCH_X86_64 |
||
371 | %define dst_reg r10 |
||
372 | %define dst_regd r10d |
||
373 | %else |
||
374 | %define dst_reg r1 |
||
375 | %define dst_regd r1d |
||
376 | %endif |
||
377 | mov dst_regd, dword [r1+r5*4] |
||
378 | lea dst_reg, [r0+dst_reg] |
||
379 | DC_ADD_MMX2_OP movh, dst_reg, r3, r6 |
||
380 | %ifndef ARCH_X86_64 |
||
381 | mov r1, r1m |
||
382 | %endif |
||
383 | inc r5 |
||
384 | add r2, 32 |
||
385 | cmp r5, 16 |
||
386 | jl .nextblock |
||
387 | REP_RET |
||
388 | .no_dc |
||
389 | mov r6d, dword [r1+r5*4] |
||
390 | lea r6, [r0+r6] |
||
391 | IDCT4_ADD r6, r2, r3 |
||
392 | .skipblock |
||
393 | inc r5 |
||
394 | add r2, 32 |
||
395 | cmp r5, 16 |
||
396 | jl .nextblock |
||
397 | REP_RET |
||
398 | |||
399 | ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, |
||
400 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
401 | cglobal h264_idct_add16intra_mmx, 5, 7, 0 |
||
402 | xor r5, r5 |
||
403 | %ifdef PIC |
||
404 | lea r11, [scan8_mem] |
||
405 | %endif |
||
406 | .nextblock |
||
407 | movzx r6, byte [scan8+r5] |
||
408 | movzx r6, byte [r4+r6] |
||
409 | or r6w, word [r2] |
||
410 | test r6, r6 |
||
411 | jz .skipblock |
||
412 | mov r6d, dword [r1+r5*4] |
||
413 | lea r6, [r0+r6] |
||
414 | IDCT4_ADD r6, r2, r3 |
||
415 | .skipblock |
||
416 | inc r5 |
||
417 | add r2, 32 |
||
418 | cmp r5, 16 |
||
419 | jl .nextblock |
||
420 | REP_RET |
||
421 | |||
422 | ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, |
||
423 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
424 | cglobal h264_idct_add16intra_mmx2, 5, 7, 0 |
||
425 | xor r5, r5 |
||
426 | %ifdef PIC |
||
427 | lea r11, [scan8_mem] |
||
428 | %endif |
||
429 | .nextblock |
||
430 | movzx r6, byte [scan8+r5] |
||
431 | movzx r6, byte [r4+r6] |
||
432 | test r6, r6 |
||
433 | jz .try_dc |
||
434 | mov r6d, dword [r1+r5*4] |
||
435 | lea r6, [r0+r6] |
||
436 | IDCT4_ADD r6, r2, r3 |
||
437 | inc r5 |
||
438 | add r2, 32 |
||
439 | cmp r5, 16 |
||
440 | jl .nextblock |
||
441 | REP_RET |
||
442 | .try_dc |
||
443 | movsx r6, word [r2] |
||
444 | test r6, r6 |
||
445 | jz .skipblock |
||
446 | DC_ADD_MMX2_INIT r2, r3, r6 |
||
447 | %ifdef ARCH_X86_64 |
||
448 | %define dst_reg r10 |
||
449 | %define dst_regd r10d |
||
450 | %else |
||
451 | %define dst_reg r1 |
||
452 | %define dst_regd r1d |
||
453 | %endif |
||
454 | mov dst_regd, dword [r1+r5*4] |
||
455 | lea dst_reg, [r0+dst_reg] |
||
456 | DC_ADD_MMX2_OP movh, dst_reg, r3, r6 |
||
457 | %ifndef ARCH_X86_64 |
||
458 | mov r1, r1m |
||
459 | %endif |
||
460 | .skipblock |
||
461 | inc r5 |
||
462 | add r2, 32 |
||
463 | cmp r5, 16 |
||
464 | jl .nextblock |
||
465 | REP_RET |
||
466 | |||
467 | ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, |
||
468 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
469 | cglobal h264_idct8_add4_mmx2, 5, 7, 0 |
||
470 | %assign pad 128+4-(stack_offset&7) |
||
471 | SUB rsp, pad |
||
472 | |||
473 | xor r5, r5 |
||
474 | %ifdef PIC |
||
475 | lea r11, [scan8_mem] |
||
476 | %endif |
||
477 | .nextblock |
||
478 | movzx r6, byte [scan8+r5] |
||
479 | movzx r6, byte [r4+r6] |
||
480 | test r6, r6 |
||
481 | jz .skipblock |
||
482 | cmp r6, 1 |
||
483 | jnz .no_dc |
||
484 | movsx r6, word [r2] |
||
485 | test r6, r6 |
||
486 | jz .no_dc |
||
487 | DC_ADD_MMX2_INIT r2, r3, r6 |
||
488 | %ifdef ARCH_X86_64 |
||
489 | %define dst_reg r10 |
||
490 | %define dst_regd r10d |
||
491 | %else |
||
492 | %define dst_reg r1 |
||
493 | %define dst_regd r1d |
||
494 | %endif |
||
495 | mov dst_regd, dword [r1+r5*4] |
||
496 | lea dst_reg, [r0+dst_reg] |
||
497 | DC_ADD_MMX2_OP mova, dst_reg, r3, r6 |
||
498 | lea dst_reg, [dst_reg+r3*4] |
||
499 | DC_ADD_MMX2_OP mova, dst_reg, r3, r6 |
||
500 | %ifndef ARCH_X86_64 |
||
501 | mov r1, r1m |
||
502 | %endif |
||
503 | add r5, 4 |
||
504 | add r2, 128 |
||
505 | cmp r5, 16 |
||
506 | jl .nextblock |
||
507 | |||
508 | ADD rsp, pad |
||
509 | RET |
||
510 | .no_dc |
||
511 | mov r6d, dword [r1+r5*4] |
||
512 | lea r6, [r0+r6] |
||
513 | add word [r2], 32 |
||
514 | IDCT8_ADD_MMX_START r2 , rsp |
||
515 | IDCT8_ADD_MMX_START r2+8, rsp+64 |
||
516 | IDCT8_ADD_MMX_END r6 , rsp, r3 |
||
517 | mov r6d, dword [r1+r5*4] |
||
518 | lea r6, [r0+r6+4] |
||
519 | IDCT8_ADD_MMX_END r6 , rsp+8, r3 |
||
520 | .skipblock |
||
521 | add r5, 4 |
||
522 | add r2, 128 |
||
523 | cmp r5, 16 |
||
524 | jl .nextblock |
||
525 | |||
526 | ADD rsp, pad |
||
527 | RET |
||
528 | |||
529 | INIT_XMM |
||
530 | ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, |
||
531 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
532 | cglobal h264_idct8_add4_sse2, 5, 7, 10 |
||
533 | xor r5, r5 |
||
534 | %ifdef PIC |
||
535 | lea r11, [scan8_mem] |
||
536 | %endif |
||
537 | .nextblock |
||
538 | movzx r6, byte [scan8+r5] |
||
539 | movzx r6, byte [r4+r6] |
||
540 | test r6, r6 |
||
541 | jz .skipblock |
||
542 | cmp r6, 1 |
||
543 | jnz .no_dc |
||
544 | movsx r6, word [r2] |
||
545 | test r6, r6 |
||
546 | jz .no_dc |
||
547 | INIT_MMX |
||
548 | DC_ADD_MMX2_INIT r2, r3, r6 |
||
549 | %ifdef ARCH_X86_64 |
||
550 | %define dst_reg r10 |
||
551 | %define dst_regd r10d |
||
552 | %else |
||
553 | %define dst_reg r1 |
||
554 | %define dst_regd r1d |
||
555 | %endif |
||
556 | mov dst_regd, dword [r1+r5*4] |
||
557 | lea dst_reg, [r0+dst_reg] |
||
558 | DC_ADD_MMX2_OP mova, dst_reg, r3, r6 |
||
559 | lea dst_reg, [dst_reg+r3*4] |
||
560 | DC_ADD_MMX2_OP mova, dst_reg, r3, r6 |
||
561 | %ifndef ARCH_X86_64 |
||
562 | mov r1, r1m |
||
563 | %endif |
||
564 | add r5, 4 |
||
565 | add r2, 128 |
||
566 | cmp r5, 16 |
||
567 | jl .nextblock |
||
568 | REP_RET |
||
569 | .no_dc |
||
570 | INIT_XMM |
||
571 | mov dst_regd, dword [r1+r5*4] |
||
572 | lea dst_reg, [r0+dst_reg] |
||
573 | IDCT8_ADD_SSE dst_reg, r2, r3, r6 |
||
574 | %ifndef ARCH_X86_64 |
||
575 | mov r1, r1m |
||
576 | %endif |
||
577 | .skipblock |
||
578 | add r5, 4 |
||
579 | add r2, 128 |
||
580 | cmp r5, 16 |
||
581 | jl .nextblock |
||
582 | REP_RET |
||
583 | |||
584 | INIT_MMX |
||
585 | h264_idct_add8_mmx_plane: |
||
586 | .nextblock |
||
587 | movzx r6, byte [scan8+r5] |
||
588 | movzx r6, byte [r4+r6] |
||
589 | or r6w, word [r2] |
||
590 | test r6, r6 |
||
591 | jz .skipblock |
||
592 | %ifdef ARCH_X86_64 |
||
593 | mov r0d, dword [r1+r5*4] |
||
594 | add r0, [r10] |
||
595 | %else |
||
596 | mov r0, r1m ; XXX r1m here is actually r0m of the calling func |
||
597 | mov r0, [r0] |
||
598 | add r0, dword [r1+r5*4] |
||
599 | %endif |
||
600 | IDCT4_ADD r0, r2, r3 |
||
601 | .skipblock |
||
602 | inc r5 |
||
603 | add r2, 32 |
||
604 | test r5, 3 |
||
605 | jnz .nextblock |
||
606 | rep ret |
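; a bare "ret" that is itself a branch target mispredicts on some AMD
; CPUs; the rep prefix (the same trick REP_RET applies to cglobal
; epilogues) sidesteps that at no cost, since these helpers return right
; after a conditional branch.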

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    call         h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmx2_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
    call         h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx2_plane
    RET

INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]    ;  0  0  X  D
    punpcklwd    m0, [r2+32]    ;  x  X  d  D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0         ;  d  d  D  D
    pxor         m1, m1         ;  0  0  0  0
    psubw        m1, m0         ; -d -d -D -D
    packuswb     m0, m1         ; -d -d -D -D  d  d  D  D
    pshufw       m1, m0, 0xFA   ; -d -d -d -d -D -D -D -D
    punpcklwd    m0, m0         ;  d  d  d  d  D  D  D  D
    lea          r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret
706 | ALIGN 16 |
||
707 | INIT_XMM |
||
708 | ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride |
||
709 | x264_add8x4_idct_sse2: |
||
710 | movq m0, [r2+ 0] |
||
711 | movq m1, [r2+ 8] |
||
712 | movq m2, [r2+16] |
||
713 | movq m3, [r2+24] |
||
714 | movhps m0, [r2+32] |
||
715 | movhps m1, [r2+40] |
||
716 | movhps m2, [r2+48] |
||
717 | movhps m3, [r2+56] |
||
718 | IDCT4_1D 0,1,2,3,4,5 |
||
719 | TRANSPOSE2x4x4W 0,1,2,3,4 |
||
720 | paddw m0, [pw_32] |
||
721 | IDCT4_1D 0,1,2,3,4,5 |
||
722 | pxor m7, m7 |
||
723 | STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 |
||
724 | lea r0, [r0+r3*2] |
||
725 | STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 |
||
726 | ret |
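; This helper does two horizontally adjacent 4x4 blocks at once: movq and
; movhps pack one block into the low and one into the high half of each
; xmm register, and TRANSPOSE2x4x4W transposes the two 4x4 tiles
; independently. The SSE2 add16/add8 loops below are built around these
; 8x4 pairs.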
||
727 | |||
728 | %macro add16_sse2_cycle 2 |
||
729 | movzx r0, word [r4+%2] |
||
730 | test r0, r0 |
||
731 | jz .cycle%1end |
||
732 | mov r0d, dword [r1+%1*8] |
||
733 | %ifdef ARCH_X86_64 |
||
734 | add r0, r10 |
||
735 | %else |
||
736 | add r0, r0m |
||
737 | %endif |
||
738 | call x264_add8x4_idct_sse2 |
||
739 | .cycle%1end |
||
740 | %if %1 < 7 |
||
741 | add r2, 64 |
||
742 | %endif |
||
743 | %endmacro |
||
744 | |||
745 | ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, |
||
746 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
747 | cglobal h264_idct_add16_sse2, 5, 5, 8 |
||
748 | %ifdef ARCH_X86_64 |
||
749 | mov r10, r0 |
||
750 | %endif |
||
751 | ; unrolling of the loop leads to an average performance gain of |
||
752 | ; 20-25% |
||
753 | add16_sse2_cycle 0, 0xc |
||
754 | add16_sse2_cycle 1, 0x14 |
||
755 | add16_sse2_cycle 2, 0xe |
||
756 | add16_sse2_cycle 3, 0x16 |
||
757 | add16_sse2_cycle 4, 0x1c |
||
758 | add16_sse2_cycle 5, 0x24 |
||
759 | add16_sse2_cycle 6, 0x1e |
||
760 | add16_sse2_cycle 7, 0x26 |
||
761 | RET |
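; The second argument of each cycle is the byte offset of an 8x4 pair's
; first block in nnzc[]: 0xc is scan8 position 4+1*8, 0x14 is 4+2*8, and
; so on; the word-sized movzx in the macro thus tests the non-zero counts
; of both 4x4 blocks of the pair with a single load.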
||
762 | |||
763 | ae112918 | Ronald S. Bultje | %macro add16intra_sse2_cycle 2 |
764 | movzx r0, word [r4+%2] |
||
765 | 1d16a1cf | Ronald S. Bultje | test r0, r0 |
766 | ae112918 | Ronald S. Bultje | jz .try%1dc |
767 | mov r0d, dword [r1+%1*8] |
||
768 | 1d16a1cf | Ronald S. Bultje | %ifdef ARCH_X86_64 |
769 | add r0, r10 |
||
770 | %else |
||
771 | add r0, r0m |
||
772 | %endif |
||
773 | call x264_add8x4_idct_sse2 |
||
774 | ae112918 | Ronald S. Bultje | jmp .cycle%1end |
775 | .try%1dc |
||
776 | 1d16a1cf | Ronald S. Bultje | movsx r0, word [r2 ] |
777 | or r0w, word [r2+32] |
||
778 | ae112918 | Ronald S. Bultje | jz .cycle%1end |
779 | mov r0d, dword [r1+%1*8] |
||
780 | 1d16a1cf | Ronald S. Bultje | %ifdef ARCH_X86_64 |
781 | add r0, r10 |
||
782 | %else |
||
783 | add r0, r0m |
||
784 | %endif |
||
785 | call h264_idct_dc_add8_mmx2 |
||
786 | ae112918 | Ronald S. Bultje | .cycle%1end |
787 | %if %1 < 7 |
||
788 | 1d16a1cf | Ronald S. Bultje | add r2, 64 |
789 | ae112918 | Ronald S. Bultje | %endif |
790 | %endmacro |
||
791 | |||
792 | ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, |
||
793 | ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
||
794 | cglobal h264_idct_add16intra_sse2, 5, 7, 8 |
||
795 | %ifdef ARCH_X86_64 |
||
796 | mov r10, r0 |
||
797 | %endif |
||
798 | add16intra_sse2_cycle 0, 0xc |
||
799 | add16intra_sse2_cycle 1, 0x14 |
||
800 | add16intra_sse2_cycle 2, 0xe |
||
801 | add16intra_sse2_cycle 3, 0x16 |
||
802 | add16intra_sse2_cycle 4, 0x1c |
||
803 | add16intra_sse2_cycle 5, 0x24 |
||
804 | add16intra_sse2_cycle 6, 0x1e |
||
805 | add16intra_sse2_cycle 7, 0x26 |
||
806 | RET |
||

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+%1*8+64]
    add          r0, [r10]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+%1*8+64]
%endif
    call         x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+%1*8+64]
    add          r0, [r10]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+%1*8+64]
%endif
    call         h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 3
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
    SWAP %1, %4, %3
%endmacro
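; The luma DC block uses a Hadamard (Walsh) transform rather than the
; cosine-based IDCT, so two levels of SUMSUB_BADC butterflies per pass
; suffice and no multiplies are needed.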
||
866 | |||
867 | %macro DEQUANT_MMX 3 |
||
868 | mova m7, [pw_1] |
||
869 | mova m4, %1 |
||
870 | punpcklwd %1, m7 |
||
871 | punpckhwd m4, m7 |
||
872 | mova m5, %2 |
||
873 | punpcklwd %2, m7 |
||
874 | punpckhwd m5, m7 |
||
875 | movd m7, t3d |
||
876 | punpckldq m7, m7 |
||
877 | pmaddwd %1, m7 |
||
878 | pmaddwd %2, m7 |
||
879 | pmaddwd m4, m7 |
||
880 | pmaddwd m5, m7 |
||
881 | psrad %1, %3 |
||
882 | psrad %2, %3 |
||
883 | psrad m4, %3 |
||
884 | psrad m5, %3 |
||
885 | packssdw %1, m4 |
||
886 | packssdw %2, m5 |
||
887 | %endmacro |
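; Dequant trick: each coefficient is interleaved with a 1 from [pw_1],
; and t3d carries qmul in its low word and the rounding bias in its high
; word (the caller adds 128 << 16 first), so a single pmaddwd computes
; coef*qmul + 128 per lane and psrad finishes the shift (>> 8 in the
; common case).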
||
888 | |||
889 | %macro STORE_WORDS_MMX 5 |
||
890 | movd t0d, %1 |
||
891 | psrlq %1, 32 |
||
892 | movd t1d, %1 |
||
893 | mov [t2+%2*32], t0w |
||
894 | mov [t2+%4*32], t1w |
||
895 | shr t0d, 16 |
||
896 | shr t1d, 16 |
||
897 | mov [t2+%3*32], t0w |
||
898 | mov [t2+%5*32], t1w |
||
899 | %endmacro |
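; The 16 dequantized DC values are not stored contiguously: each goes out
; at a 32-byte (16 DCTELEM) stride, i.e. to position 0 of its own 4x4
; coefficient block, where the per-block IDCTs above will pick it up.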
||
900 | |||
901 | %macro DEQUANT_STORE_MMX 1 |
||
902 | DEQUANT_MMX m0, m1, %1 |
||
903 | STORE_WORDS_MMX m0, 0, 1, 4, 5 |
||
904 | STORE_WORDS_MMX m1, 2, 3, 6, 7 |
||
905 | |||
906 | DEQUANT_MMX m2, m3, %1 |
||
907 | STORE_WORDS_MMX m2, 8, 9, 12, 13 |
||
908 | STORE_WORDS_MMX m3, 10, 11, 14, 15 |
||
909 | %endmacro |
||
910 | |||
911 | %macro STORE_WORDS_SSE 9 |
||
912 | movd t0d, %1 |
||
913 | psrldq %1, 4 |
||
914 | movd t1d, %1 |
||
915 | psrldq %1, 4 |
||
916 | mov [t2+%2*32], t0w |
||
917 | mov [t2+%4*32], t1w |
||
918 | shr t0d, 16 |
||
919 | shr t1d, 16 |
||
920 | mov [t2+%3*32], t0w |
||
921 | mov [t2+%5*32], t1w |
||
922 | movd t0d, %1 |
||
923 | psrldq %1, 4 |
||
924 | movd t1d, %1 |
||
925 | mov [t2+%6*32], t0w |
||
926 | mov [t2+%8*32], t1w |
||
927 | shr t0d, 16 |
||
928 | shr t1d, 16 |
||
929 | mov [t2+%7*32], t0w |
||
930 | mov [t2+%9*32], t1w |
||
931 | %endmacro |
||
932 | |||
933 | %macro DEQUANT_STORE_SSE2 1 |
||
934 | movd xmm4, t3d |
||
935 | movq xmm5, [pw_1] |
||
936 | pshufd xmm4, xmm4, 0 |
||
937 | movq2dq xmm0, m0 |
||
938 | movq2dq xmm1, m1 |
||
939 | movq2dq xmm2, m2 |
||
940 | movq2dq xmm3, m3 |
||
941 | punpcklwd xmm0, xmm5 |
||
942 | punpcklwd xmm1, xmm5 |
||
943 | punpcklwd xmm2, xmm5 |
||
944 | punpcklwd xmm3, xmm5 |
||
945 | pmaddwd xmm0, xmm4 |
||
946 | pmaddwd xmm1, xmm4 |
||
947 | pmaddwd xmm2, xmm4 |
||
948 | pmaddwd xmm3, xmm4 |
||
949 | psrad xmm0, %1 |
||
950 | psrad xmm1, %1 |
||
951 | psrad xmm2, %1 |
||
952 | psrad xmm3, %1 |
||
953 | packssdw xmm0, xmm1 |
||
954 | packssdw xmm2, xmm3 |
||
955 | STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7 |
||
956 | STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15 |
||
957 | %endmacro |
||
958 | |||
959 | %macro IDCT_DC_DEQUANT 2 |
||
960 | cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2 |
||
961 | movq m3, [r1+24] |
||
962 | movq m2, [r1+16] |
||
963 | movq m1, [r1+ 8] |
||
964 | movq m0, [r1+ 0] |
||
965 | WALSH4_1D 0,1,2,3,4 |
||
966 | TRANSPOSE4x4W 0,1,2,3,4 |
||
967 | WALSH4_1D 0,1,2,3,4 |
||
968 | |||
969 | ; shift, tmp, output, qmul |
||
970 | %ifdef WIN64 |
||
971 | DECLARE_REG_TMP 0,3,1,2 |
||
972 | ; we can't avoid this, because r0 is the shift register (ecx) on win64 |
||
973 | xchg r0, t2 |
||
974 | %elifdef ARCH_X86_64 |
||
975 | DECLARE_REG_TMP 3,1,0,2 |
||
976 | %else |
||
977 | DECLARE_REG_TMP 1,3,0,2 |
||
978 | %endif |
||
979 | |||
980 | cmp t3d, 32767 |
||
981 | jg .big_qmul |
||
982 | add t3d, 128 << 16 |
||
983 | %ifidn %1,mmx |
||
984 | DEQUANT_STORE_MMX 8 |
||
985 | %else |
||
986 | DEQUANT_STORE_SSE2 8 |
||
987 | %endif |
||
988 | RET |
||
989 | .big_qmul: |
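    ; qmul is too large for pmaddwd's signed 16-bit operands, so shift it
    ; down (by up to 7 bits) until it fits and trim the same amount off
    ; the final right shift; the rounding bias is shifted along with it.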
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%ifidn %1, mmx
    movd         m6, t1d
    DEQUANT_STORE_MMX m6
%else
    movd       xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro

INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7