Revision b1159ad9 libavcodec/x86/dsputil_yasm.asm

--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -100,38 +100,3 @@
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    paddw   m0, [v1q + orderq]
-    paddw   m1, [v1q + orderq + mmsize]
-    mova    [v1q + orderq], m0
-    mova    [v1q + orderq + mmsize], m1
-    add     orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu    m2, [v2q + orderq]
-    movu    m3, [v2q + orderq + mmsize]
-    mova    m0, [v1q + orderq]
-    mova    m1, [v1q + orderq + mmsize]
-    psubw   m0, m2
-    psubw   m1, m3
-    mova    [v1q + orderq], m0
-    mova    [v1q + orderq + mmsize], m1
-    add     orderq, mmsize*2
-    jl .loop
-    REP_RET
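
For reference, a plain-C sketch of what the two removed routines compute, read directly off the asm (hypothetical helper names, not FFmpeg's own C code). Each SIMD loop steps mmsize*2 bytes at a time, so order is assumed to be a suitable multiple of the vector width.

    #include <stdint.h>

    /* Element-wise v1 += v2, with paddw's 16-bit wraparound. */
    static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
    {
        for (int i = 0; i < order; i++)
            v1[i] = (int16_t)(v1[i] + v2[i]);
    }

    /* Element-wise v1 -= v2, with psubw's 16-bit wraparound. */
    static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
    {
        for (int i = 0; i < order; i++)
            v1[i] = (int16_t)(v1[i] - v2[i]);
    }
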
@@ -138,5 +103,4 @@
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
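
The body of scalarproduct_int16 is mostly collapsed in this view; only its final accumulation and return appear in the next hunk. As a hedged plain-C model of the prototype above (the exact point at which shift is applied inside the SIMD loop is an assumption, not something this diff shows):

    #include <stdint.h>

    /* Hypothetical scalar model of the prototype; shift placement assumed. */
    static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                           int order, int shift)
    {
        int32_t res = 0;
        while (order--)
            res += (*v1++ * *v2++) >> shift;
        return res;
    }
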
@@ -165,6 +129,51 @@
     paddd   m2, m0
     movd   eax, m2
     RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd   eax, m6
+    RET
 %endmacro
 
 INIT_MMX
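
A plain-C model of the scalarproduct_and_madd_int16 routine added above, derived from the SIMD loop itself (hypothetical name, not FFmpeg's C reference): it returns the dot product of v1 and v2 taken on the original v1 values, and in the same pass rewrites v1 as v1 + mul*v3, wrapping at 16 bits as pmullw/paddw do. As elsewhere in this file, order is assumed to be a multiple of the unrolled vector width.

    #include <stdint.h>

    /* Hypothetical reference sketch of the loop above. */
    static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                    const int16_t *v3, int order,
                                                    int mul)
    {
        int32_t res = 0;
        for (int i = 0; i < order; i++) {
            res  += v1[i] * v2[i];                   /* pmaddwd + paddd       */
            v1[i] = (int16_t)(v1[i] + mul * v3[i]);  /* pmullw + paddw, wraps */
        }
        return res;
    }
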
@@ -172,3 +181,39 @@
 INIT_XMM
 SCALARPRODUCT sse2
 
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, [v1q + orderq]
+    paddw   m3, [v1q + orderq + mmsize]
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
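
In SCALARPRODUCT_LOOP, %1 is a byte offset: for a non-zero offset, each unaligned 16-byte vector of v2 and v3 is rebuilt from two aligned loads with palignr, and the same offset is applied to both streams. A scalar model of that realignment step (hypothetical helper, not an FFmpeg function):

    #include <stdint.h>

    /* Extract the 16 bytes that start 'shift' bytes into the aligned block
     * 'lo', spilling over into the next aligned block 'hi'; this is the
     * effect of palignr hi, lo, shift as used above. */
    static void palignr_model(uint8_t dst[16], const uint8_t hi[16],
                              const uint8_t lo[16], int shift)
    {
        for (int i = 0; i < 16; i++) {
            int j = i + shift;            /* index into the 32-byte pair lo:hi */
            dst[i] = j < 16 ? lo[j] : hi[j - 16];
        }
    }

Because the ssse3 entry point below derives that offset from v2 alone before aligning both pointers down, v2 and v3 are expected to share the same misalignment within their 16-byte blocks.
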
@@ -174,0 +220,44 @@
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov    r4d, v2d
+    and    r4d, 15
+    and    v2q, ~15
+    and    v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd   eax, m6
+    RET
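
The .end epilogue above, like the tail of the sse2 version of scalarproduct_and_madd_int16, folds the four 32-bit partial sums in m6 into one value: movhlps/paddd adds the high 64 bits onto the low 64, then pshuflw with 0x4e swaps the two remaining dwords so the final paddd leaves the total in the low dword for movd. A scalar model (hypothetical helper):

    #include <stdint.h>

    /* Scalar model of the movhlps / pshuflw 0x4e / paddd reduction above. */
    static int32_t hsum_epi32(const int32_t s[4])
    {
        int32_t a = s[0] + s[2];   /* movhlps + paddd           */
        int32_t b = s[1] + s[3];
        return a + b;              /* pshuflw 0x4e swap + paddd */
    }
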
@@ -175,3 +264,4 @@
+
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
