ffmpeg / libavcodec / x86 / vp8dsp.asm @ 565344e7
History | View | Annotate | Download (27.8 KB)
1 | 0178d14f | Jason Garrett-Glaser | ;****************************************************************************** |
---|---|---|---|
2 | ;* VP8 MMXEXT optimizations |
||
3 | ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
||
4 | ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
||
5 | ;* |
||
6 | ;* This file is part of FFmpeg. |
||
7 | ;* |
||
8 | ;* FFmpeg is free software; you can redistribute it and/or |
||
9 | ;* modify it under the terms of the GNU Lesser General Public |
||
10 | ;* License as published by the Free Software Foundation; either |
||
11 | ;* version 2.1 of the License, or (at your option) any later version. |
||
12 | ;* |
||
13 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
16 | ;* Lesser General Public License for more details. |
||
17 | ;* |
||
18 | ;* You should have received a copy of the GNU Lesser General Public |
||
19 | ;* License along with FFmpeg; if not, write to the Free Software |
||
20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
21 | ;****************************************************************************** |
||
22 | |||
23 | %include "x86inc.asm" |
||
24 | 004cda8e | Jason Garrett-Glaser | %include "x86util.asm" |
25 | 0178d14f | Jason Garrett-Glaser | |
26 | SECTION_RODATA |
||
27 | |||
28 | fourtap_filter_hw_m: times 4 dw -6, 123 |
||
29 | times 4 dw 12, -1 |
||
30 | times 4 dw -9, 93 |
||
31 | times 4 dw 50, -6 |
||
32 | times 4 dw -6, 50 |
||
33 | times 4 dw 93, -9 |
||
34 | times 4 dw -1, 12 |
||
35 | times 4 dw 123, -6 |
||
36 | |||
37 | sixtap_filter_hw_m: times 4 dw 2, -11 |
||
38 | times 4 dw 108, 36 |
||
39 | times 4 dw -8, 1 |
||
40 | times 4 dw 3, -16 |
||
41 | times 4 dw 77, 77 |
||
42 | times 4 dw -16, 3 |
||
43 | times 4 dw 1, -8 |
||
44 | times 4 dw 36, 108 |
||
45 | times 4 dw -11, 2 |
||
46 | |||
47 | fourtap_filter_hb_m: times 8 db -6, -1 |
||
48 | times 8 db 123, 12 |
||
49 | times 8 db -9, -6 |
||
50 | times 8 db 93, 50 |
||
51 | times 8 db -6, -9 |
||
52 | times 8 db 50, 93 |
||
53 | times 8 db -1, -6 |
||
54 | times 8 db 12, 123 |
||
55 | |||
56 | sixtap_filter_hb_m: times 8 db 2, 1 |
||
57 | times 8 db -11, 108 |
||
58 | times 8 db 36, -8 |
||
59 | times 8 db 3, 3 |
||
60 | times 8 db -16, 77 |
||
61 | times 8 db 77, -16 |
||
62 | times 8 db 1, 2 |
||
63 | times 8 db -8, 36 |
||
64 | times 8 db 108, -11 |
||
65 | |||
66 | fourtap_filter_v_m: times 8 dw -6 |
||
67 | times 8 dw 123 |
||
68 | times 8 dw 12 |
||
69 | times 8 dw -1 |
||
70 | times 8 dw -9 |
||
71 | times 8 dw 93 |
||
72 | times 8 dw 50 |
||
73 | times 8 dw -6 |
||
74 | times 8 dw -6 |
||
75 | times 8 dw 50 |
||
76 | times 8 dw 93 |
||
77 | times 8 dw -9 |
||
78 | times 8 dw -1 |
||
79 | times 8 dw 12 |
||
80 | times 8 dw 123 |
||
81 | times 8 dw -6 |
||
82 | |||
83 | sixtap_filter_v_m: times 8 dw 2 |
||
84 | times 8 dw -11 |
||
85 | times 8 dw 108 |
||
86 | times 8 dw 36 |
||
87 | times 8 dw -8 |
||
88 | times 8 dw 1 |
||
89 | times 8 dw 3 |
||
90 | times 8 dw -16 |
||
91 | times 8 dw 77 |
||
92 | times 8 dw 77 |
||
93 | times 8 dw -16 |
||
94 | times 8 dw 3 |
||
95 | times 8 dw 1 |
||
96 | times 8 dw -8 |
||
97 | times 8 dw 36 |
||
98 | times 8 dw 108 |
||
99 | times 8 dw -11 |
||
100 | times 8 dw 2 |
||
101 | |||
102 | a173aa89 | Jason Garrett-Glaser | bilinear_filter_vw_m: times 8 dw 1 |
103 | times 8 dw 2 |
||
104 | times 8 dw 3 |
||
105 | times 8 dw 4 |
||
106 | times 8 dw 5 |
||
107 | times 8 dw 6 |
||
108 | times 8 dw 7 |
||
109 | |||
110 | bilinear_filter_vb_m: times 8 db 7, 1 |
||
111 | times 8 db 6, 2 |
||
112 | times 8 db 5, 3 |
||
113 | times 8 db 4, 4 |
||
114 | times 8 db 3, 5 |
||
115 | times 8 db 2, 6 |
||
116 | times 8 db 1, 7 |
||
117 | |||
118 | 0178d14f | Jason Garrett-Glaser | %ifdef PIC |
119 | a173aa89 | Jason Garrett-Glaser | %define fourtap_filter_hw r11 |
120 | %define sixtap_filter_hw r11 |
||
121 | %define fourtap_filter_hb r11 |
||
122 | %define sixtap_filter_hb r11 |
||
123 | %define fourtap_filter_v r11 |
||
124 | %define sixtap_filter_v r11 |
||
125 | %define bilinear_filter_vw r11 |
||
126 | %define bilinear_filter_vb r11 |
||
127 | 0178d14f | Jason Garrett-Glaser | %else |
128 | %define fourtap_filter_hw fourtap_filter_hw_m |
||
129 | %define sixtap_filter_hw sixtap_filter_hw_m |
||
130 | %define fourtap_filter_hb fourtap_filter_hb_m |
||
131 | %define sixtap_filter_hb sixtap_filter_hb_m |
||
132 | %define fourtap_filter_v fourtap_filter_v_m |
||
133 | %define sixtap_filter_v sixtap_filter_v_m |
||
134 | a173aa89 | Jason Garrett-Glaser | %define bilinear_filter_vw bilinear_filter_vw_m |
135 | %define bilinear_filter_vb bilinear_filter_vb_m |
||
136 | 0178d14f | Jason Garrett-Glaser | %endif |
137 | |||
138 | a173aa89 | Jason Garrett-Glaser | filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
139 | filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 |
||
140 | 0178d14f | Jason Garrett-Glaser | |
141 | a173aa89 | Jason Garrett-Glaser | filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 | filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |
||
143 | filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
||
144 | 0178d14f | Jason Garrett-Glaser | |
145 | 2dd2f716 | Ronald S. Bultje | pw_20091: times 4 dw 20091 |
146 | pw_17734: times 4 dw 17734 |
||
147 | |||
148 | 004cda8e | Jason Garrett-Glaser | cextern pw_3 |
149 | 0178d14f | Jason Garrett-Glaser | cextern pw_4 |
150 | cextern pw_64 |
||
151 | |||
152 | SECTION .text |
||
153 | |||
154 | ;----------------------------------------------------------------------------- |
||
155 | ; subpel MC functions: |
||
156 | ; |
||
157 | ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |
||
158 | ; uint8_t *src, int srcstride, |
||
159 | ; int height, int mx, int my); |
||
160 | ;----------------------------------------------------------------------------- |
||
161 | |||
162 | ; 4x4 block, H-only 4-tap filter |
||
163 | cglobal put_vp8_epel4_h4_mmxext, 6, 6 |
||
164 | shl r5d, 4 |
||
165 | %ifdef PIC |
||
166 | lea r11, [fourtap_filter_hw_m] |
||
167 | %endif |
||
168 | movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |
||
169 | movq mm5, [fourtap_filter_hw+r5] |
||
170 | movq mm7, [pw_64] |
||
171 | pxor mm6, mm6 |
||
172 | |||
173 | .nextrow |
||
174 | movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels |
||
175 | |||
176 | ; first set of 2 pixels |
||
177 | movq mm2, mm1 ; byte ABCD.. |
||
178 | punpcklbw mm1, mm6 ; byte->word ABCD |
||
179 | pshufw mm0, mm2, 9 ; byte CDEF.. |
||
180 | punpcklbw mm0, mm6 ; byte->word CDEF |
||
181 | pshufw mm3, mm1, 0x94 ; word ABBC |
||
182 | pshufw mm1, mm0, 0x94 ; word CDDE |
||
183 | pmaddwd mm3, mm4 ; multiply 2px with F0/F1 |
||
184 | movq mm0, mm1 ; backup for second set of pixels |
||
185 | pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |
||
186 | paddd mm3, mm1 ; finish 1st 2px |
||
187 | |||
188 | ; second set of 2 pixels, use backup of above |
||
189 | punpckhbw mm2, mm6 ; byte->word EFGH |
||
190 | pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 |
||
191 | pshufw mm1, mm2, 0x94 ; word EFFG |
||
192 | pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |
||
193 | paddd mm0, mm1 ; finish 2nd 2px |
||
194 | |||
195 | ; merge two sets of 2 pixels into one set of 4, round/clip/store |
||
196 | packssdw mm3, mm0 ; merge dword->word (4px) |
||
197 | paddsw mm3, mm7 ; rounding |
||
198 | psraw mm3, 7 |
||
199 | packuswb mm3, mm6 ; clip and word->bytes |
||
200 | movd [r0], mm3 ; store |
||
201 | |||
202 | ; go to next line |
||
203 | add r0, r1 |
||
204 | add r2, r3 |
||
205 | dec r4 ; next row |
||
206 | jg .nextrow |
||
207 | REP_RET |
||
208 | |||
209 | ; 4x4 block, H-only 6-tap filter |
||
210 | cglobal put_vp8_epel4_h6_mmxext, 6, 6 |
||
211 | lea r5d, [r5*3] |
||
212 | %ifdef PIC |
||
213 | lea r11, [sixtap_filter_hw_m] |
||
214 | %endif |
||
215 | movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words (three F0/F1, F2/F3, F4/F5 pairs in mm4-mm6) |
||
216 | movq mm5, [sixtap_filter_hw+r5*8-32] |
||
217 | movq mm6, [sixtap_filter_hw+r5*8-16] |
||
218 | movq mm7, [pw_64] |
||
219 | pxor mm3, mm3 |
||
220 | |||
221 | .nextrow |
||
222 | movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels, starting 2 bytes left of r2 |
||
223 | |||
224 | ; first set of 2 pixels |
||
225 | movq mm2, mm1 ; byte ABCD.. |
||
226 | punpcklbw mm1, mm3 ; byte->word ABCD |
||
227 | pshufw mm0, mm2, 0x9 ; byte CDEF.. |
||
228 | punpckhbw mm2, mm3 ; byte->word EFGH |
||
229 | punpcklbw mm0, mm3 ; byte->word CDEF |
||
230 | pshufw mm1, mm1, 0x94 ; word ABBC |
||
231 | pshufw mm2, mm2, 0x94 ; word EFFG |
||
232 | pmaddwd mm1, mm4 ; multiply 2px with F0/F1 |
||
233 | pshufw mm3, mm0, 0x94 ; word CDDE |
||
234 | movq mm0, mm3 ; backup for second set of pixels |
||
235 | pmaddwd mm3, mm5 ; multiply 2px with F2/F3 |
||
236 | paddd mm1, mm3 ; add to 1st 2px cache |
||
237 | movq mm3, mm2 ; backup for second set of pixels |
||
238 | pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |
||
239 | paddd mm1, mm2 ; finish 1st 2px |
||
240 | |||
241 | ; second set of 2 pixels, use backup of above |
||
242 | movd mm2, [r2+3] ; byte FGHI (prevent overreads) |
||
243 | pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 |
||
244 | pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 |
||
245 | paddd mm0, mm3 ; add to 2nd 2px cache |
||
246 | pxor mm3, mm3 |
||
247 | punpcklbw mm2, mm3 ; byte->word FGHI |
||
248 | pshufw mm2, mm2, 0xE9 ; word GHHI |
||
249 | pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |
||
250 | paddd mm0, mm2 ; finish 2nd 2px |
||
251 | |||
252 | ; merge two sets of 2 pixels into one set of 4, round/clip/store |
||
253 | packssdw mm1, mm0 ; merge dword->word (4px) |
||
254 | paddsw mm1, mm7 ; rounding |
||
255 | psraw mm1, 7 |
||
256 | packuswb mm1, mm3 ; clip and word->bytes |
||
257 | movd [r0], mm1 ; store |
||
258 | |||
259 | ; go to next line |
||
260 | add r0, r1 |
||
261 | add r2, r3 |
||
262 | dec r4 ; next row |
||
263 | jg .nextrow |
||
264 | REP_RET |
||
265 | |||
266 | ; 8-pixel-wide block, H-only 4-tap filter |
||
267 | INIT_XMM |
||
268 | cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 |
||
269 | shl r5d, 4 |
||
270 | %ifdef PIC |
||
271 | lea r11, [fourtap_filter_hw_m] |
||
272 | %endif |
||
273 | mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words |
||
274 | mova m6, [fourtap_filter_hw+r5] |
||
275 | pxor m7, m7 |
||
276 | |||
277 | .nextrow |
||
278 | movh m0, [r2-1] |
||
279 | punpcklbw m0, m7 ; ABCDEFGH |
||
280 | mova m1, m0 |
||
281 | mova m2, m0 |
||
282 | mova m3, m0 |
||
283 | psrldq m1, 2 ; BCDEFGH |
||
284 | psrldq m2, 4 ; CDEFGH |
||
285 | psrldq m3, 6 ; DEFGH |
||
286 | punpcklwd m0, m1 ; ABBCCDDE |
||
287 | punpcklwd m2, m3 ; CDDEEFFG |
||
288 | pmaddwd m0, m5 |
||
289 | pmaddwd m2, m6 |
||
290 | paddd m0, m2 |
||
291 | |||
292 | movh m1, [r2+3] |
||
293 | punpcklbw m1, m7 ; EFGHIJKL (same row, 4 pixels right of the first load) |
||
294 | mova m2, m1 |
||
295 | mova m3, m1 |
||
296 | mova m4, m1 |
||
297 | psrldq m2, 2 ; FGHIJKL |
||
298 | psrldq m3, 4 ; GHIJKL |
||
299 | psrldq m4, 6 ; HIJKL |
||
300 | punpcklwd m1, m2 ; EFFGGHHI |
||
301 | punpcklwd m3, m4 ; GHHIIJJK |
||
302 | pmaddwd m1, m5 |
||
303 | pmaddwd m3, m6 |
||
304 | paddd m1, m3 |
||
305 | |||
306 | packssdw m0, m1 |
||
307 | paddsw m0, [pw_64] |
||
308 | psraw m0, 7 |
||
309 | packuswb m0, m7 |
||
310 | movh [r0], m0 ; store |
||
311 | |||
312 | ; go to next line |
||
313 | add r0, r1 |
||
314 | add r2, r3 |
||
315 | dec r4 ; next row |
||
316 | jg .nextrow |
||
317 | REP_RET |
||
318 | |||
319 | cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 |
||
320 | lea r5d, [r5*3] |
||
321 | %ifdef PIC |
||
322 | lea r11, [sixtap_filter_hw_m] |
||
323 | %endif |
||
324 | lea r5, [sixtap_filter_hw+r5*8] |
||
325 | pxor m7, m7 |
||
326 | |||
327 | .nextrow |
||
328 | movu m0, [r2-2] |
||
329 | mova m6, m0 |
||
330 | mova m4, m0 |
||
331 | punpcklbw m0, m7 ; ABCDEFGH |
||
332 | mova m1, m0 |
||
333 | mova m2, m0 |
||
334 | mova m3, m0 |
||
335 | psrldq m1, 2 ; BCDEFGH |
||
336 | psrldq m2, 4 ; CDEFGH |
||
337 | psrldq m3, 6 ; DEFGH |
||
338 | psrldq m4, 4 |
||
339 | punpcklbw m4, m7 ; EFGH |
||
340 | mova m5, m4 |
||
341 | psrldq m5, 2 ; FGH |
||
342 | punpcklwd m0, m1 ; ABBCCDDE |
||
343 | punpcklwd m2, m3 ; CDDEEFFG |
||
344 | punpcklwd m4, m5 ; EFFGGHHI |
||
345 | pmaddwd m0, [r5-48] |
||
346 | pmaddwd m2, [r5-32] |
||
347 | pmaddwd m4, [r5-16] |
||
348 | paddd m0, m2 |
||
349 | paddd m0, m4 |
||
350 | |||
351 | psrldq m6, 4 |
||
352 | mova m4, m6 |
||
353 | punpcklbw m6, m7 ; EFGHIJKL (second half: source shifted right by 4 pixels) |
||
354 | mova m1, m6 |
||
355 | mova m2, m6 |
||
356 | mova m3, m6 |
||
357 | psrldq m1, 2 ; FGHIJKL |
||
358 | psrldq m2, 4 ; GHIJKL |
||
359 | psrldq m3, 6 ; HIJKL |
||
360 | psrldq m4, 4 |
||
361 | punpcklbw m4, m7 ; IJKL |
||
362 | mova m5, m4 |
||
363 | psrldq m5, 2 ; JKL |
||
364 | punpcklwd m6, m1 ; EFFGGHHI |
||
365 | punpcklwd m2, m3 ; GHHIIJJK |
||
366 | punpcklwd m4, m5 ; IJJKKLLM |
||
367 | pmaddwd m6, [r5-48] |
||
368 | pmaddwd m2, [r5-32] |
||
369 | pmaddwd m4, [r5-16] |
||
370 | paddd m6, m2 |
||
371 | paddd m6, m4 |
||
372 | |||
373 | packssdw m0, m6 |
||
374 | paddsw m0, [pw_64] |
||
375 | psraw m0, 7 |
||
376 | packuswb m0, m7 |
||
377 | movh [r0], m0 ; store |
||
378 | |||
379 | ; go to next line |
||
380 | add r0, r1 |
||
381 | add r2, r3 |
||
382 | dec r4 ; next row |
||
383 | jg .nextrow |
||
384 | REP_RET |
||
385 | |||
386 | cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7 |
||
387 | shl r5d, 4 |
||
388 | mova m2, [pw_64] |
||
389 | a173aa89 | Jason Garrett-Glaser | mova m3, [filter_h4_shuf] |
390 | mova m4, [filter_h6_shuf2] |
||
391 | 0178d14f | Jason Garrett-Glaser | %ifdef PIC |
392 | lea r11, [fourtap_filter_hb_m] |
||
393 | %endif |
||
394 | mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
||
395 | mova m6, [fourtap_filter_hb+r5] |
||
396 | |||
397 | .nextrow |
||
398 | movu m0, [r2-1] |
||
399 | mova m1, m0 |
||
400 | pshufb m0, m3 |
||
401 | pshufb m1, m4 |
||
402 | pmaddubsw m0, m5 |
||
403 | pmaddubsw m1, m6 |
||
404 | paddsw m0, m2 |
||
405 | paddsw m0, m1 |
||
406 | psraw m0, 7 |
||
407 | packuswb m0, m0 |
||
408 | movh [r0], m0 ; store |
||
409 | |||
410 | ; go to next line |
||
411 | add r0, r1 |
||
412 | add r2, r3 |
||
413 | dec r4 ; next row |
||
414 | jg .nextrow |
||
415 | REP_RET |
||
416 | |||
417 | cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8 |
||
418 | lea r5d, [r5*3] |
||
419 | a173aa89 | Jason Garrett-Glaser | mova m3, [filter_h6_shuf1] |
420 | mova m4, [filter_h6_shuf2] |
||
421 | 0178d14f | Jason Garrett-Glaser | %ifdef PIC |
422 | lea r11, [sixtap_filter_hb_m] |
||
423 | %endif |
||
424 | mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
||
425 | mova m6, [sixtap_filter_hb+r5*8-32] |
||
426 | mova m7, [sixtap_filter_hb+r5*8-16] |
||
427 | |||
428 | .nextrow |
||
429 | movu m0, [r2-2] |
||
430 | mova m1, m0 |
||
431 | mova m2, m0 |
||
432 | pshufb m0, m3 |
||
433 | pshufb m1, m4 |
||
434 | a173aa89 | Jason Garrett-Glaser | pshufb m2, [filter_h6_shuf3] |
435 | 0178d14f | Jason Garrett-Glaser | pmaddubsw m0, m5 |
436 | pmaddubsw m1, m6 |
||
437 | pmaddubsw m2, m7 |
||
438 | paddsw m0, m1 |
||
439 | paddsw m0, m2 |
||
440 | paddsw m0, [pw_64] |
||
441 | psraw m0, 7 |
||
442 | packuswb m0, m0 |
||
443 | movh [r0], m0 ; store |
||
444 | |||
445 | ; go to next line |
||
446 | add r0, r1 |
||
447 | add r2, r3 |
||
448 | dec r4 ; next row |
||
449 | jg .nextrow |
||
450 | REP_RET |
||
451 | |||
452 | %macro FILTER_V 3 |
||
453 | ; 4x4 block, V-only 4-tap filter |
||
454 | cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 |
||
455 | shl r6d, 5 |
||
456 | %ifdef PIC |
||
457 | lea r11, [fourtap_filter_v_m] |
||
458 | %endif |
||
459 | lea r6, [fourtap_filter_v+r6-32] |
||
460 | mova m6, [pw_64] |
||
461 | pxor m7, m7 |
||
462 | mova m5, [r6+48] |
||
463 | |||
464 | ; read 3 lines |
||
465 | sub r2, r3 |
||
466 | movh m0, [r2] |
||
467 | movh m1, [r2+ r3] |
||
468 | movh m2, [r2+2*r3] |
||
469 | add r2, r3 |
||
470 | punpcklbw m0, m7 |
||
471 | punpcklbw m1, m7 |
||
472 | punpcklbw m2, m7 |
||
473 | |||
474 | .nextrow |
||
475 | ; first calculate negative taps (to prevent losing positive overflows) |
||
476 | movh m4, [r2+2*r3] ; read new row |
||
477 | punpcklbw m4, m7 |
||
478 | mova m3, m4 |
||
479 | pmullw m0, [r6+0] |
||
480 | pmullw m4, m5 |
||
481 | paddsw m4, m0 |
||
482 | |||
483 | ; then calculate positive taps |
||
484 | mova m0, m1 |
||
485 | pmullw m1, [r6+16] |
||
486 | paddsw m4, m1 |
||
487 | mova m1, m2 |
||
488 | pmullw m2, [r6+32] |
||
489 | paddsw m4, m2 |
||
490 | mova m2, m3 |
||
491 | |||
492 | ; round/clip/store |
||
493 | paddsw m4, m6 |
||
494 | psraw m4, 7 |
||
495 | packuswb m4, m7 |
||
496 | movh [r0], m4 |
||
497 | |||
498 | ; go to next line |
||
499 | add r0, r1 |
||
500 | add r2, r3 |
||
501 | dec r4 ; next row |
||
502 | jg .nextrow |
||
503 | REP_RET |
||
504 | |||
505 | |||
506 | ; 4x4 block, V-only 6-tap filter |
||
507 | cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 |
||
508 | shl r6d, 4 |
||
509 | lea r6, [r6*3] |
||
510 | %ifdef PIC |
||
511 | lea r11, [sixtap_filter_v_m] |
||
512 | %endif |
||
513 | lea r6, [sixtap_filter_v+r6-96] |
||
514 | pxor m7, m7 |
||
515 | |||
516 | ; read 5 lines |
||
517 | sub r2, r3 |
||
518 | sub r2, r3 |
||
519 | movh m0, [r2] |
||
520 | movh m1, [r2+r3] |
||
521 | movh m2, [r2+r3*2] |
||
522 | lea r2, [r2+r3*2] |
||
523 | add r2, r3 |
||
524 | movh m3, [r2] |
||
525 | movh m4, [r2+r3] |
||
526 | punpcklbw m0, m7 |
||
527 | punpcklbw m1, m7 |
||
528 | punpcklbw m2, m7 |
||
529 | punpcklbw m3, m7 |
||
530 | punpcklbw m4, m7 |
||
531 | |||
532 | .nextrow |
||
533 | ; first calculate negative taps (to prevent losing positive overflows) |
||
534 | mova m5, m1 |
||
535 | pmullw m5, [r6+16] |
||
536 | mova m6, m4 |
||
537 | pmullw m6, [r6+64] |
||
538 | paddsw m6, m5 |
||
539 | |||
540 | ; then calculate positive taps |
||
541 | movh m5, [r2+2*r3] ; read new row |
||
542 | punpcklbw m5, m7 |
||
543 | pmullw m0, [r6+0] |
||
544 | paddsw m6, m0 |
||
545 | mova m0, m1 |
||
546 | mova m1, m2 |
||
547 | pmullw m2, [r6+32] |
||
548 | paddsw m6, m2 |
||
549 | mova m2, m3 |
||
550 | pmullw m3, [r6+48] |
||
551 | paddsw m6, m3 |
||
552 | mova m3, m4 |
||
553 | mova m4, m5 |
||
554 | pmullw m5, [r6+80] |
||
555 | paddsw m6, m5 |
||
556 | |||
557 | ; round/clip/store |
||
558 | paddsw m6, [pw_64] |
||
559 | psraw m6, 7 |
||
560 | packuswb m6, m7 |
||
561 | movh [r0], m6 |
||
562 | |||
563 | ; go to next line |
||
564 | add r0, r1 |
||
565 | add r2, r3 |
||
566 | dec r4 ; next row |
||
567 | jg .nextrow |
||
568 | REP_RET |
||
569 | %endmacro |
||
570 | |||
571 | INIT_MMX |
||
572 | FILTER_V mmxext, 4, 0 |
||
573 | INIT_XMM |
||
574 | FILTER_V sse2, 8, 8 |
||
575 | |||
576 | cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8 |
||
577 | shl r6d, 4 |
||
578 | %ifdef PIC |
||
579 | lea r11, [fourtap_filter_hb_m] |
||
580 | %endif |
||
581 | mova m5, [fourtap_filter_hb+r6-16] |
||
582 | mova m6, [fourtap_filter_hb+r6] |
||
583 | mova m7, [pw_64] |
||
584 | |||
585 | ; read 3 lines |
||
586 | sub r2, r3 |
||
587 | movh m0, [r2] |
||
588 | movh m1, [r2+ r3] |
||
589 | movh m2, [r2+2*r3] |
||
590 | add r2, r3 |
||
591 | |||
592 | .nextrow |
||
593 | movh m3, [r2+2*r3] ; read new row |
||
594 | mova m4, m0 |
||
595 | mova m0, m1 |
||
596 | punpcklbw m4, m3 |
||
597 | punpcklbw m1, m2 |
||
598 | pmaddubsw m4, m5 |
||
599 | pmaddubsw m1, m6 |
||
600 | paddsw m4, m1 |
||
601 | mova m1, m2 |
||
602 | paddsw m4, m7 |
||
603 | mova m2, m3 |
||
604 | psraw m4, 7 |
||
605 | packuswb m4, m4 |
||
606 | movh [r0], m4 |
||
607 | |||
608 | ; go to next line |
||
609 | add r0, r1 |
||
610 | add r2, r3 |
||
611 | dec r4 ; next row |
||
612 | jg .nextrow |
||
613 | REP_RET |
||
614 | |||
615 | cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8 |
||
616 | lea r6d, [r6*3] |
||
617 | %ifdef PIC |
||
618 | lea r11, [sixtap_filter_hb_m] |
||
619 | %endif |
||
620 | lea r6, [sixtap_filter_hb+r6*8] |
||
621 | |||
622 | ; read 5 lines |
||
623 | sub r2, r3 |
||
624 | sub r2, r3 |
||
625 | movh m0, [r2] |
||
626 | movh m1, [r2+r3] |
||
627 | movh m2, [r2+r3*2] |
||
628 | lea r2, [r2+r3*2] |
||
629 | add r2, r3 |
||
630 | movh m3, [r2] |
||
631 | movh m4, [r2+r3] |
||
632 | |||
633 | .nextrow |
||
634 | movh m5, [r2+2*r3] ; read new row |
||
635 | mova m6, m0 |
||
636 | punpcklbw m6, m5 |
||
637 | mova m0, m1 |
||
638 | punpcklbw m1, m2 |
||
639 | mova m7, m3 |
||
640 | punpcklbw m7, m4 |
||
641 | pmaddubsw m6, [r6-48] |
||
642 | pmaddubsw m1, [r6-32] |
||
643 | pmaddubsw m7, [r6-16] |
||
644 | paddsw m6, m1 |
||
645 | paddsw m6, m7 |
||
646 | mova m1, m2 |
||
647 | paddsw m6, [pw_64] |
||
648 | mova m2, m3 |
||
649 | psraw m6, 7 |
||
650 | mova m3, m4 |
||
651 | packuswb m6, m6 |
||
652 | mova m4, m5 |
||
653 | movh [r0], m6 |
||
654 | |||
655 | ; go to next line |
||
656 | add r0, r1 |
||
657 | add r2, r3 |
||
658 | dec r4 ; next row |
||
659 | jg .nextrow |
||
660 | REP_RET |
||
661 | |||
662 | a173aa89 | Jason Garrett-Glaser | %macro FILTER_BILINEAR 3 |
663 | cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 |
||
664 | mov r5d, 8*16 |
||
665 | shl r6d, 4 |
||
666 | sub r5d, r6d |
||
667 | %ifdef PIC |
||
668 | lea r11, [bilinear_filter_vw_m] |
||
669 | %endif |
||
670 | pxor m6, m6 |
||
671 | a912da76 | Jason Garrett-Glaser | mova m4, [bilinear_filter_vw+r5-16] |
672 | mova m5, [bilinear_filter_vw+r6-16] |
||
673 | a173aa89 | Jason Garrett-Glaser | .nextrow |
674 | movh m0, [r2+r3*0] |
||
675 | movh m1, [r2+r3*1] |
||
676 | movh m3, [r2+r3*2] |
||
677 | punpcklbw m0, m6 |
||
678 | punpcklbw m1, m6 |
||
679 | punpcklbw m3, m6 |
||
680 | mova m2, m1 |
||
681 | pmullw m0, m4 |
||
682 | pmullw m1, m5 |
||
683 | pmullw m2, m4 |
||
684 | pmullw m3, m5 |
||
685 | paddsw m0, m1 |
||
686 | paddsw m2, m3 |
||
687 | psraw m0, 2 |
||
688 | psraw m2, 2 |
||
689 | pavgw m0, m6 |
||
690 | pavgw m2, m6 |
||
691 | %ifidn %1, mmxext |
||
692 | packuswb m0, m0 |
||
693 | packuswb m2, m2 |
||
694 | movh [r0+r1*0], m0 |
||
695 | movh [r0+r1*1], m2 |
||
696 | %else |
||
697 | packuswb m0, m2 |
||
698 | movh [r0+r1*0], m0 |
||
699 | movhps [r0+r1*1], m0 |
||
700 | %endif |
||
701 | |||
702 | lea r0, [r0+r1*2] |
||
703 | lea r2, [r2+r3*2] |
||
704 | sub r4, 2 |
||
705 | jg .nextrow |
||
706 | REP_RET |
||
707 | |||
708 | cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 |
||
709 | mov r6d, 8*16 |
||
710 | shl r5d, 4 |
||
711 | sub r6d, r5d |
||
712 | %ifdef PIC |
||
713 | lea r11, [bilinear_filter_vw_m] |
||
714 | %endif |
||
715 | pxor m6, m6 |
||
716 | a912da76 | Jason Garrett-Glaser | mova m4, [bilinear_filter_vw+r6-16] |
717 | mova m5, [bilinear_filter_vw+r5-16] |
||
718 | a173aa89 | Jason Garrett-Glaser | .nextrow |
719 | movh m0, [r2+r3*0+0] |
||
720 | movh m1, [r2+r3*0+1] |
||
721 | movh m2, [r2+r3*1+0] |
||
722 | movh m3, [r2+r3*1+1] |
||
723 | punpcklbw m0, m6 |
||
724 | punpcklbw m1, m6 |
||
725 | punpcklbw m2, m6 |
||
726 | punpcklbw m3, m6 |
||
727 | pmullw m0, m4 |
||
728 | pmullw m1, m5 |
||
729 | pmullw m2, m4 |
||
730 | pmullw m3, m5 |
||
731 | paddsw m0, m1 |
||
732 | paddsw m2, m3 |
||
733 | psraw m0, 2 |
||
734 | psraw m2, 2 |
||
735 | pavgw m0, m6 |
||
736 | pavgw m2, m6 |
||
737 | %ifidn %1, mmxext |
||
738 | packuswb m0, m0 |
||
739 | packuswb m2, m2 |
||
740 | movh [r0+r1*0], m0 |
||
741 | movh [r0+r1*1], m2 |
||
742 | %else |
||
743 | packuswb m0, m2 |
||
744 | movh [r0+r1*0], m0 |
||
745 | movhps [r0+r1*1], m0 |
||
746 | %endif |
||
747 | |||
748 | lea r0, [r0+r1*2] |
||
749 | lea r2, [r2+r3*2] |
||
750 | sub r4, 2 |
||
751 | jg .nextrow |
||
752 | REP_RET |
||
753 | %endmacro |
||
754 | |||
755 | INIT_MMX |
||
756 | FILTER_BILINEAR mmxext, 4, 0 |
||
757 | INIT_XMM |
||
758 | FILTER_BILINEAR sse2, 8, 7 |
||
759 | |||
760 | cglobal put_vp8_bilinear8_v_ssse3, 7,7,5 |
||
761 | shl r6d, 4 |
||
762 | %ifdef PIC |
||
763 | lea r11, [bilinear_filter_vb_m] |
||
764 | %endif |
||
765 | pxor m4, m4 |
||
766 | a912da76 | Jason Garrett-Glaser | mova m3, [bilinear_filter_vb+r6-16] |
767 | a173aa89 | Jason Garrett-Glaser | .nextrow |
768 | movh m0, [r2+r3*0] |
||
769 | movh m1, [r2+r3*1] |
||
770 | movh m2, [r2+r3*2] |
||
771 | punpcklbw m0, m1 |
||
772 | punpcklbw m1, m2 |
||
773 | pmaddubsw m0, m3 |
||
774 | pmaddubsw m1, m3 |
||
775 | psraw m0, 2 |
||
776 | psraw m1, 2 |
||
777 | pavgw m0, m4 |
||
778 | pavgw m1, m4 |
||
779 | packuswb m0, m1 |
||
780 | movh [r0+r1*0], m0 |
||
781 | movhps [r0+r1*1], m0 |
||
782 | |||
783 | lea r0, [r0+r1*2] |
||
784 | lea r2, [r2+r3*2] |
||
785 | sub r4, 2 |
||
786 | jg .nextrow |
||
787 | REP_RET |
||
788 | |||
789 | cglobal put_vp8_bilinear8_h_ssse3, 7,7,5 |
||
790 | shl r5d, 4 |
||
791 | %ifdef PIC |
||
792 | lea r11, [bilinear_filter_vb_m] |
||
793 | %endif |
||
794 | pxor m4, m4 |
||
795 | mova m2, [filter_h2_shuf] |
||
796 | a912da76 | Jason Garrett-Glaser | mova m3, [bilinear_filter_vb+r5-16] |
797 | a173aa89 | Jason Garrett-Glaser | .nextrow |
798 | movu m0, [r2+r3*0] |
||
799 | movu m1, [r2+r3*1] |
||
800 | pshufb m0, m2 |
||
801 | pshufb m1, m2 |
||
802 | pmaddubsw m0, m3 |
||
803 | pmaddubsw m1, m3 |
||
804 | psraw m0, 2 |
||
805 | psraw m1, 2 |
||
806 | pavgw m0, m4 |
||
807 | pavgw m1, m4 |
||
808 | packuswb m0, m1 |
||
809 | movh [r0+r1*0], m0 |
||
810 | movhps [r0+r1*1], m0 |
||
811 | |||
812 | lea r0, [r0+r1*2] |
||
813 | lea r2, [r2+r3*2] |
||
814 | sub r4, 2 |
||
815 | jg .nextrow |
||
816 | REP_RET |
||
817 | |||
818 | 0fecad09 | Jason Garrett-Glaser | cglobal put_vp8_pixels8_mmx, 5,5 |
819 | .nextrow: |
||
820 | movq mm0, [r2+r3*0] |
||
821 | movq mm1, [r2+r3*1] |
||
822 | lea r2, [r2+r3*2] |
||
823 | movq [r0+r1*0], mm0 |
||
824 | movq [r0+r1*1], mm1 |
||
825 | lea r0, [r0+r1*2] |
||
826 | sub r4d, 2 |
||
827 | jg .nextrow |
||
828 | REP_RET |
||
829 | |||
830 | cglobal put_vp8_pixels16_mmx, 5,5 |
||
831 | .nextrow: |
||
832 | movq mm0, [r2+r3*0+0] |
||
833 | movq mm1, [r2+r3*0+8] |
||
834 | movq mm2, [r2+r3*1+0] |
||
835 | movq mm3, [r2+r3*1+8] |
||
836 | lea r2, [r2+r3*2] |
||
837 | movq [r0+r1*0+0], mm0 |
||
838 | movq [r0+r1*0+8], mm1 |
||
839 | movq [r0+r1*1+0], mm2 |
||
840 | movq [r0+r1*1+8], mm3 |
||
841 | lea r0, [r0+r1*2] |
||
842 | sub r4d, 2 |
||
843 | jg .nextrow |
||
844 | REP_RET |
||
845 | |||
846 | cglobal put_vp8_pixels16_sse, 5,5,2 |
||
847 | .nextrow: |
||
848 | movups xmm0, [r2+r3*0] |
||
849 | movups xmm1, [r2+r3*1] |
||
850 | lea r2, [r2+r3*2] |
||
851 | movaps [r0+r1*0], xmm0 |
||
852 | movaps [r0+r1*1], xmm1 |
||
853 | lea r0, [r0+r1*2] |
||
854 | sub r4d, 2 |
||
855 | jg .nextrow |
||
856 | REP_RET |
||
857 | |||
858 | 0178d14f | Jason Garrett-Glaser | ;----------------------------------------------------------------------------- |
859 | ; IDCT functions: |
||
860 | ; |
||
861 | ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
||
862 | ;----------------------------------------------------------------------------- |
||
863 | |||
864 | cglobal vp8_idct_dc_add_mmx, 3, 3 |
||
865 | ; load data |
||
866 | movd mm0, [r1] |
||
867 | |||
868 | ; calculate DC |
||
869 | paddw mm0, [pw_4] |
||
870 | pxor mm1, mm1 |
||
871 | psraw mm0, 3 |
||
872 | psubw mm1, mm0 |
||
873 | packuswb mm0, mm0 |
||
874 | packuswb mm1, mm1 |
||
875 | punpcklbw mm0, mm0 |
||
876 | punpcklbw mm1, mm1 |
||
877 | punpcklwd mm0, mm0 |
||
878 | punpcklwd mm1, mm1 |
||
879 | |||
880 | ; add DC |
||
881 | lea r1, [r0+r2*2] |
||
882 | movd mm2, [r0] |
||
883 | movd mm3, [r0+r2] |
||
884 | movd mm4, [r1] |
||
885 | movd mm5, [r1+r2] |
||
886 | paddusb mm2, mm0 |
||
887 | paddusb mm3, mm0 |
||
888 | paddusb mm4, mm0 |
||
889 | paddusb mm5, mm0 |
||
890 | psubusb mm2, mm1 |
||
891 | psubusb mm3, mm1 |
||
892 | psubusb mm4, mm1 |
||
893 | psubusb mm5, mm1 |
||
894 | movd [r0], mm2 |
||
895 | movd [r0+r2], mm3 |
||
896 | movd [r1], mm4 |
||
897 | movd [r1+r2], mm5 |
||
898 | RET |
||
899 | |||
900 | cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
||
901 | ; load data |
||
902 | movd xmm0, [r1] |
||
903 | lea r1, [r0+r2*2] |
||
904 | pxor xmm1, xmm1 |
||
905 | movq xmm2, [pw_4] |
||
906 | |||
907 | ; calculate DC |
||
908 | paddw xmm0, xmm2 |
||
909 | movd xmm2, [r0] |
||
910 | movd xmm3, [r0+r2] |
||
911 | movd xmm4, [r1] |
||
912 | movd xmm5, [r1+r2] |
||
913 | psraw xmm0, 3 |
||
914 | pshuflw xmm0, xmm0, 0 |
||
915 | punpcklqdq xmm0, xmm0 |
||
916 | punpckldq xmm2, xmm3 |
||
917 | punpckldq xmm4, xmm5 |
||
918 | punpcklbw xmm2, xmm1 |
||
919 | punpcklbw xmm4, xmm1 |
||
920 | paddw xmm2, xmm0 |
||
921 | paddw xmm4, xmm0 |
||
922 | packuswb xmm2, xmm4 |
||
923 | movd [r0], xmm2 |
||
924 | pextrd [r0+r2], xmm2, 1 |
||
925 | pextrd [r1], xmm2, 2 |
||
926 | pextrd [r1+r2], xmm2, 3 |
||
927 | RET |
||
928 | 004cda8e | Jason Garrett-Glaser | |
929 | ;----------------------------------------------------------------------------- |
||
930 | 2dd2f716 | Ronald S. Bultje | ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
931 | ;----------------------------------------------------------------------------- |
||
932 | |||
933 | ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) |
||
934 | ; this macro assumes that m6/m7 have words for 20091/17734 loaded |
||
935 | %macro VP8_MULTIPLY_SUMSUB 4 |
||
936 | mova %3, %1 |
||
937 | mova %4, %2 |
||
938 | pmulhw %3, m6 ;20091(1) |
||
939 | pmulhw %4, m6 ;20091(2) |
||
940 | paddw %3, %1 |
||
941 | paddw %4, %2 |
||
942 | psllw %1, 1 |
||
943 | psllw %2, 1 |
||
944 | pmulhw %1, m7 ;35468(1) |
||
945 | pmulhw %2, m7 ;35468(2) |
||
946 | psubw %1, %4 |
||
947 | paddw %2, %3 |
||
948 | %endmacro |
||
949 | |||
950 | ; calculate x0=%1+%3; x1=%1-%3 |
||
951 | ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) |
||
952 | ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) |
||
953 | ; %5/%6 are temporary registers |
||
954 | ; we assume m6/m7 have constant words 20091/17734 loaded in them |
||
955 | %macro VP8_IDCT_TRANSFORM4x4_1D 6 |
||
956 | SUMSUB_BA m%3, m%1, m%5 ;t0, t1 |
||
957 | VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 |
||
958 | SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 |
||
959 | SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 |
||
960 | SWAP %4, %1 |
||
961 | SWAP %4, %3 |
||
962 | %endmacro |
||
963 | |||
964 | INIT_MMX |
||
965 | cglobal vp8_idct_add_mmx, 3, 3 |
||
966 | ; load block data |
||
967 | movq m0, [r1] |
||
968 | movq m1, [r1+8] |
||
969 | movq m2, [r1+16] |
||
970 | movq m3, [r1+24] |
||
971 | movq m6, [pw_20091] |
||
972 | movq m7, [pw_17734] |
||
973 | |||
974 | ; actual IDCT |
||
975 | VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |
||
976 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||
977 | paddw m0, [pw_4] |
||
978 | VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |
||
979 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||
980 | |||
981 | ; store |
||
982 | pxor m4, m4 |
||
983 | lea r1, [r0+2*r2] |
||
984 | STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 |
||
985 | STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 |
||
986 | |||
987 | RET |
||
988 | |||
989 | ;----------------------------------------------------------------------------- |
||
990 | 004cda8e | Jason Garrett-Glaser | ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
991 | ;----------------------------------------------------------------------------- |
||
992 | |||
993 | %macro SCATTER_WHT 1 |
||
994 | pextrw r1d, m0, %1 |
||
995 | pextrw r2d, m1, %1 |
||
996 | mov [r0+2*16*0], r1w |
||
997 | mov [r0+2*16*1], r2w |
||
998 | pextrw r1d, m2, %1 |
||
999 | pextrw r2d, m3, %1 |
||
1000 | mov [r0+2*16*2], r1w |
||
1001 | mov [r0+2*16*3], r2w |
||
1002 | %endmacro |
||
1003 | |||
1004 | %macro HADAMARD4_1D 4 |
||
1005 | SUMSUB_BADC m%2, m%1, m%4, m%3 |
||
1006 | SUMSUB_BADC m%4, m%2, m%3, m%1 |
||
1007 | SWAP %1, %4, %3 |
||
1008 | %endmacro |
||
1009 | |||
1010 | INIT_MMX |
||
1011 | cglobal vp8_luma_dc_wht_mmxext, 2,3 |
||
1012 | movq m0, [r1] |
||
1013 | movq m1, [r1+8] |
||
1014 | movq m2, [r1+16] |
||
1015 | movq m3, [r1+24] |
||
1016 | HADAMARD4_1D 0, 1, 2, 3 |
||
1017 | TRANSPOSE4x4W 0, 1, 2, 3, 4 |
||
1018 | paddw m0, [pw_3] |
||
1019 | HADAMARD4_1D 0, 1, 2, 3 |
||
1020 | psraw m0, 3 |
||
1021 | psraw m1, 3 |
||
1022 | psraw m2, 3 |
||
1023 | psraw m3, 3 |
||
1024 | SCATTER_WHT 0 |
||
1025 | add r0, 2*16*4 |
||
1026 | SCATTER_WHT 1 |
||
1027 | add r0, 2*16*4 |
||
1028 | SCATTER_WHT 2 |
||
1029 | add r0, 2*16*4 |
||
1030 | SCATTER_WHT 3 |
||
1031 | RET |