Revision b1159ad9 libavcodec/x86/dsputil_yasm.asm
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m0, [v2q + orderq]
-    movu m1, [v2q + orderq + mmsize]
-    paddw m0, [v1q + orderq]
-    paddw m1, [v1q + orderq + mmsize]
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m2, [v2q + orderq]
-    movu m3, [v2q + orderq + mmsize]
-    mova m0, [v1q + orderq]
-    mova m1, [v1q + orderq + mmsize]
-    psubw m0, m2
-    psubw m1, m3
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
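
The two helpers removed above add or subtract int16 vectors element-wise into v1; a minimal scalar C sketch of their behavior, for reference only (not the C fallback that replaces them):

    #include <stdint.h>

    /* Scalar model of the removed kernels (illustration only). */
    static void add_int16_model(int16_t *v1, const int16_t *v2, int order)
    {
        while (order--)
            *v1++ += *v2++;
    }

    static void sub_int16_model(int16_t *v1, const int16_t *v2, int order)
    {
        while (order--)
            *v1++ -= *v2++;
    }

Their work is presumably taken over by the fused scalarproduct_and_madd_int16 routine added further down in this revision.
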
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
...
     paddd m2, m0
     movd eax, m2
     RET
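
Only the tail of scalarproduct_int16 is shown here; it folds the 32-bit partial sums in m2 into eax. The elided body presumably accumulates pmaddwd results, as the routine added below does visibly; a scalar model of one 128-bit pmaddwd step, as an illustration only:

    #include <stdint.h>

    /* pmaddwd model: multiply eight int16 pairs and add adjacent products,
     * yielding four int32 partial sums (illustration only). */
    static void pmaddwd_model(int32_t dst[4], const int16_t a[8], const int16_t b[8])
    {
        for (int i = 0; i < 4; i++)
            dst[i] = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
    }
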
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw m7, m7, 0
+%endif
+    pxor m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu m0, [v2q + orderq]
+    movu m1, [v2q + orderq + mmsize]
+    mova m4, [v1q + orderq]
+    mova m5, [v1q + orderq + mmsize]
+    movu m2, [v3q + orderq]
+    movu m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw m2, m7
+    pmullw m3, m7
+    paddd m6, m0
+    paddd m6, m1
+    paddw m2, m4
+    paddw m3, m5
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    add orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw m0, m6, 0x4e
+%endif
+    paddd m6, m0
+    movd eax, m6
+    RET
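
The new routine fuses a dot product with a multiply-accumulate back into v1: pmaddwd/paddd build up the return value in m6 while pmullw/paddw update v1 in place. A scalar C model consistent with the vector code above (an illustration of the behavior, not the actual FFmpeg C fallback):

    #include <stdint.h>

    /* Model of scalarproduct_and_madd_int16 (illustration only): returns
     * sum(v1[i]*v2[i]) and updates v1[i] += mul*v3[i], with the same
     * 16-bit wrap-around as pmullw/paddw. */
    static int32_t scalarproduct_and_madd_model(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3, int order, int mul)
    {
        int32_t res = 0;
        for (int i = 0; i < order; i++) {
            res  += v1[i] * v2[i];                  /* pmaddwd + paddd */
            v1[i] = (int16_t)(v1[i] + mul * v3[i]); /* pmullw + paddw  */
        }
        return res;
    }

With mul set to +1 or -1 this covers what the removed add_int16/sub_int16 kernels did, on top of returning the scalar product.
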
 %endmacro
 
 INIT_MMX
...
 INIT_XMM
 SCALARPRODUCT sse2
 
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub orderq, mmsize*2
+%if %1
+    mova m1, m4
+    mova m4, [v2q + orderq]
+    mova m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova m3, m5
+    mova m5, [v3q + orderq]
+    mova m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova m0, [v2q + orderq]
+    mova m1, [v2q + orderq + mmsize]
+    mova m2, [v3q + orderq]
+    mova m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw m2, m7
+    pmullw m3, m7
+    paddw m2, [v1q + orderq]
+    paddw m3, [v1q + orderq + mmsize]
+    paddd m6, m0
+    paddd m6, m1
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
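
Each instantiation of SCALARPRODUCT_LOOP handles one fixed byte misalignment %1 of v2/v3: only aligned loads (mova) are issued, one 16-byte block per stream is carried over in a register from the previous iteration, and palignr splices two neighbouring aligned blocks back into the unaligned view. A C model of that splice, using a hypothetical helper name, illustration only:

    #include <stdint.h>
    #include <string.h>

    /* Build an unaligned 16-byte load from two aligned loads plus a byte
     * shift, which is what the palignr pairs above achieve (illustration
     * only; the helper name is hypothetical). */
    static void load_unaligned_16(uint8_t dst[16], const uint8_t *p)
    {
        const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
        size_t a = (uintptr_t)p & 15;    /* misalignment; the %1 of the macro */
        uint8_t tmp[32];

        memcpy(tmp,      base,      16); /* mova: aligned block 0 */
        memcpy(tmp + 16, base + 16, 16); /* mova: aligned block 1 */
        memcpy(dst, tmp + a, 16);        /* palignr: 16 bytes at offset a */
    }
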
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor m6, m6
+    mov r4d, v2d
+    and r4d, 15
+    and v2q, ~15
+    and v3q, ~15
+    mova m4, [v2q + orderq]
+    mova m5, [v3q + orderq]
+; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp r4d, 0
+    je .loop0
+    cmp r4d, 2
+    je .loop2
+    cmp r4d, 4
+    je .loop4
+    cmp r4d, 6
+    je .loop6
+    cmp r4d, 8
+    je .loop8
+    cmp r4d, 10
+    je .loop10
+    cmp r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd m6, m0
+    movd eax, m6
+    RET
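
The SSSE3 entry point rounds v2/v3 down to 16-byte alignment, keeps the byte offset in r4d, and enters the SCALARPRODUCT_LOOP instance specialized for that offset; as the in-code comment notes, the linear cmp/je chain predicts well because successive calls cycle through the same offsets. The epilogue at .end is a horizontal sum of the four 32-bit lanes of m6; a scalar model, as an illustration only:

    #include <stdint.h>

    /* Model of the .end epilogue (illustration only): fold the four int32
     * partial sums pairwise, as movhlps/paddd then pshuflw 0x4e/paddd do. */
    static int32_t hsum_epi32_model(const int32_t s[4])
    {
        int32_t lo0 = s[0] + s[2];  /* movhlps m0, m6; paddd m6, m0 */
        int32_t lo1 = s[1] + s[3];
        return lo0 + lo1;           /* pshuflw m0, m6, 0x4e; paddd m6, m0; movd eax, m6 */
    }
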
+
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)