## ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ 98c6053c

History | View | Annotate | Download (9.77 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* MMX optimized DSP utils |

3 |
;* Copyright (c) 2008 Loren Merritt |

4 |
;* |

5 |
;* This file is part of FFmpeg. |

6 |
;* |

7 |
;* FFmpeg is free software; you can redistribute it and/or |

8 |
;* modify it under the terms of the GNU Lesser General Public |

9 |
;* License as published by the Free Software Foundation; either |

10 |
;* version 2.1 of the License, or (at your option) any later version. |

11 |
;* |

12 |
;* FFmpeg is distributed in the hope that it will be useful, |

13 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

14 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

15 |
;* Lesser General Public License for more details. |

16 |
;* |

17 |
;* You should have received a copy of the GNU Lesser General Public |

18 |
;* License along with FFmpeg; if not, write to the Free Software |

19 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

20 |
;****************************************************************************** |

21 | |

22 |
%include "x86inc.asm" |

23 | |

24 |
SECTION_RODATA
; Control masks for the pshufb steps of the add_hfyu_left_prediction code.
; In a pshufb mask, -1 (high bit set) clears the destination byte and any
; other value selects that source byte index; the label names spell out the
; mask one hex digit per byte, with z standing for a cleared byte.
;
; Layout constraints:
;   * pb_zzzzzzzz77777777 only defines 8 bytes. The XMM code loads 16 bytes
;     from it, so pb_7 must follow it immediately and in this order.
;   * These masks are loaded with mova. pb_f is 16 bytes long, so the labels
;     after it keep whatever alignment the section start provides.
;     NOTE(review): the section alignment comes from SECTION_RODATA in
;     x86inc.asm, which is not visible here - confirm it is at least 16.
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

30 | |

31 |
section .text align=16 |

32 | |

33 |
; PSWAPD_SSE dst, src
; Stand-in for pswapd using a single pshufw. Immediate 0x4e picks source
; words 2,3,0,1, i.e. dst receives src with its two 32-bit halves exchanged.
%macro PSWAPD_SSE 2
    pshufw    %1, %2, 0x4e
%endmacro

; PSWAPD_3DN1 dst, src
; Three-instruction stand-in for pswapd: copy src, shift the high dword down
; into the low half, then place the low dword of src in the high half.
; dst and src have to be different registers, because src is read again
; after dst has already been modified.
%macro PSWAPD_3DN1 2
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endmacro

41 | |

42 |
;------------------------------------------------------------------------------
; FLOAT_TO_INT16_INTERLEAVE6 suffix
;
; Expands to float_to_int16_interleave6_<suffix>. The function converts six
; planar float channels to int16 (packssdw, i.e. signed saturation when
; narrowing) and stores them interleaved at dst.
;
; One loop iteration handles two samples of every channel: 8 bytes are read
; from each channel and 24 bytes are written to dst. The loop is bottom-tested
; and subtracts 2 from len, so at least one iteration always runs and an odd
; len is effectively rounded up to the next pair.
;
; cvtps2pi and pswapd are resolved when the macro is expanded, so the file may
; redefine either of them before an instantiation.
; emms is executed before returning.
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    ; len is copied out of r2 before the channel pointers are loaded.
    ; NOTE(review): presumably because r2 is the register bound to src1 by the
    ; cglobal name list - confirm against x86inc.asm.
    %define lend r10d
    mov        lend, r2d
%else
    ; NOTE(review): r2m looks like the in-memory argument slot, so the counter
    ; would be decremented in place - confirm against x86inc.asm.
    %define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]             ; last, because it overwrites the table pointer
    ; Turn channels 1..5 into byte offsets from channel 0 so that a single
    ; add on srcq advances all six channels.
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
.next_pair:
    ; Lane notation below: letters a..f are channels 0..5, digit is the
    ; sample index within this pair, lowest word listed first.
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3                 ; a0 a1 d0 d1
    packssdw   mm1, mm4                 ; b0 b1 e0 e1
    packssdw   mm2, mm5                 ; c0 c1 f0 f1
    pswapd     mm3, mm0                 ; d0 d1 a0 a1
    punpcklwd  mm0, mm1                 ; a0 b0 a1 b1
    punpckhwd  mm1, mm2                 ; e0 f0 e1 f1
    punpcklwd  mm2, mm3                 ; c0 d0 c1 d1
    pswapd     mm3, mm0                 ; a1 b1 a0 b0
    punpckldq  mm0, mm2                 ; a0 b0 c0 d0
    punpckhdq  mm2, mm1                 ; c1 d1 e1 f1
    punpckldq  mm1, mm3                 ; e0 f0 a1 b1
    movq       [dstq   ], mm0
    movq       [dstq+16], mm2
    movq       [dstq+ 8], mm1
    add        srcq, 8
    add        dstq, 24
    sub        lend, 2
    jg .next_pair                       ; continue while samples remain
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

90 | |

91 |
; Instantiate float_to_int16_interleave6 three times. The order of the
; define / undef lines matters: each instantiation sees whatever cvtps2pi and
; pswapd mean at that point.

; sse: native cvtps2pi, pswapd emulated with pshufw.
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse

; 3dnow: pf2id replaces cvtps2pi, pswapd emulated with three MMX instructions.
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow

; 3dn2: pf2id is still substituted for cvtps2pi, but pswapd is no longer a
; macro, so the real pswapd instruction is emitted.
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi

99 | |

100 | |

101 | |

102 |
;------------------------------------------------------------------------------
; SCALARPRODUCT suffix
;
; Expands to two functions for the vector width currently selected by
; INIT_MMX / INIT_XMM:
;
;   scalarproduct_int16_<suffix>
;       Sums v1[i]*v2[i] over order elements. The shift is applied with psrad
;       to the partial sums before the final horizontal add, not to the
;       finished total.
;
;   scalarproduct_and_madd_int16_<suffix>
;       Returns the sum of v1[i]*v2[i] computed from the original v1, and
;       updates v1[i] += mul*v3[i] using 16-bit wrapping arithmetic
;       (pmullw + paddw).
;
; Both loops convert order to a byte count, move the pointers to the end of
; the data and run a negative index up towards zero. Each iteration consumes
; two full vectors and the loop is bottom-tested, so order is assumed to be
; a positive multiple of mmsize elements.
; NOTE(review): that requirement is inferred from the loop shape - confirm
; with the callers.
;------------------------------------------------------------------------------
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl        orderq, 1                ; elements -> bytes
    add        v1q, orderq
    add        v2q, orderq
    neg        orderq                   ; index runs from -bytes up to 0
    movd       m3, shiftm
    pxor       m2, m2                   ; m2 = dword accumulators
.accumulate:
    movu       m0, [v1q + orderq]       ; v1 may be unaligned
    movu       m1, [v1q + orderq + mmsize]
    pmaddwd    m0, [v2q + orderq]       ; v2 is read as a direct memory operand
    pmaddwd    m1, [v2q + orderq + mmsize]
    paddd      m2, m0
    paddd      m2, m1
    add        orderq, mmsize*2
    jl .accumulate
%if mmsize == 16
    movhlps    m0, m2                   ; fold the upper 64 bits onto the lower
    paddd      m2, m0
    psrad      m2, m3
    pshuflw    m0, m2, 0x4e             ; swap the two low dwords
%else
    psrad      m2, m3
    pshufw     m0, m2, 0x4e             ; swap the two dwords
%endif
    paddd      m2, m0
    movd       eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl        orderq, 1                ; elements -> bytes
    movd       m7, mulm
    ; Broadcast the low 16 bits of mul to every word lane of m7.
%if mmsize == 16
    pshuflw    m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw     m7, m7, 0
%endif
    pxor       m6, m6                   ; m6 = dword accumulators
    add        v1q, orderq
    add        v2q, orderq
    add        v3q, orderq
    neg        orderq
.accumulate:
    movu       m0, [v2q + orderq]       ; v2 and v3 may be unaligned
    movu       m1, [v2q + orderq + mmsize]
    mova       m4, [v1q + orderq]       ; v1 is accessed with aligned moves
    mova       m5, [v1q + orderq + mmsize]
    movu       m2, [v3q + orderq]
    movu       m3, [v3q + orderq + mmsize]
    pmaddwd    m0, m4                   ; dot product uses the old v1 values
    pmaddwd    m1, m5
    pmullw     m2, m7                   ; low 16 bits of mul*v3
    pmullw     m3, m7
    paddd      m6, m0
    paddd      m6, m1
    paddw      m2, m4                   ; v1 + mul*v3
    paddw      m3, m5
    mova       [v1q + orderq], m2
    mova       [v1q + orderq + mmsize], m3
    add        orderq, mmsize*2
    jl .accumulate
%if mmsize == 16
    movhlps    m0, m6
    paddd      m6, m0
    pshuflw    m0, m6, 0x4e
%else
    pshufw     m0, m6, 0x4e
%endif
    paddd      m6, m0
    movd       eax, m6
    RET
%endmacro

178 | |

179 |
; Build the scalar-product functions twice: once with 8-byte MMX registers
; (suffix mmx2) and once with 16-byte XMM registers (suffix sse2).
; INIT_XMM is left in effect for the code that follows.
INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

183 | |

184 |
;------------------------------------------------------------------------------
; SCALARPRODUCT_LOOP shift
;
; One specialised inner loop of scalarproduct_and_madd_int16_ssse3, emitted
; under the label .loop<shift>. The argument is the byte misalignment that was
; masked off v2; the same shift is applied to the v3 data.
;
; Expected on entry (set up by the enclosing function):
;   orderq  = remaining byte count, walked downwards in steps of 2*mmsize
;   v2q/v3q = pointers rounded down to a 16-byte boundary
;   m4/m5   = aligned v2/v3 block at [ptr + orderq] (only used if shift != 0)
;   m6      = dword accumulators, m7 = mul broadcast to all words
;
; For a non-zero shift every aligned block is loaded once: the block from
; the previous iteration is kept in m4/m5 and palignr stitches neighbouring
; blocks together to rebuild the unaligned vectors.
;
; Non-zero variants end with jmp .end. The zero variant falls through, so it
; has to be the last instantiation before the .end label.
;------------------------------------------------------------------------------
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    ; The flags set by this sub are consumed by the jg at the bottom. Nothing
    ; in between may modify the integer flags.
    sub        orderq, mmsize*2
%if %1
    mova       m1, m4                   ; block above, kept from the previous pass
    mova       m4, [v2q + orderq]
    mova       m0, [v2q + orderq + mmsize]
    palignr    m1, m0, %1               ; v2 vector matching t1
    palignr    m0, m4, %1               ; v2 vector matching t0
    mova       m3, m5
    mova       m5, [v3q + orderq]
    mova       m2, [v3q + orderq + mmsize]
    palignr    m3, m2, %1               ; v3 vector matching t1
    palignr    m2, m5, %1               ; v3 vector matching t0
%else
    mova       m0, [v2q + orderq]
    mova       m1, [v2q + orderq + mmsize]
    mova       m2, [v3q + orderq]
    mova       m3, [v3q + orderq + mmsize]
%endif
    ; t0/t1 name the two v1 vectors. With the extra registers of x86-64 they
    ; are loaded once into m8/m9; otherwise each use reads memory directly.
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova       m8, t0
    mova       m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd    m0, t0                   ; dot product uses the old v1 values
    pmaddwd    m1, t1
    pmullw     m2, m7                   ; low 16 bits of mul*v3
    pmullw     m3, m7
    paddw      m2, t0                   ; v1 + mul*v3
    paddw      m3, t1
    paddd      m6, m0
    paddd      m6, m1
    mova       [v1q + orderq], m2
    mova       [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

228 | |

229 |
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
;
; SSSE3 variant, assembled with the XMM register set selected earlier in the
; file. Returns the sum of v1[i]*v2[i] and updates v1[i] += mul*v3[i] with
; 16-bit wrapping arithmetic.
;
; Instead of unaligned loads, v2 and v3 are rounded down to a 16-byte boundary
; and one of eight specialised loops (one per even misalignment of v2)
; rebuilds the data with palignr. Only the misalignment of v2 is measured;
; v3 is shifted by the same amount.
; NOTE(review): this implies v3 must share the misalignment of v2 - confirm
; with the callers.
;
; The loops walk from the end of the buffers towards the start. The two mova
; loads before the dispatch fetch the aligned blocks at byte offset order*2,
; i.e. at the end of the rounded-down buffers, to prime the palignr pipeline.
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl        orderq, 1                ; elements -> bytes
    movd       m7, mulm
    pshuflw    m7, m7, 0                ; broadcast the low word of mul ...
    punpcklqdq m7, m7                   ; ... to all eight word lanes
    pxor       m6, m6                   ; m6 = dword accumulators
    mov        r4d, v2d
    and        r4d, 15                  ; r4d = misalignment of v2 in bytes
    and        v2q, ~15
    and        v3q, ~15
    mova       m4, [v2q + orderq]
    mova       m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp        r4d, 0
    je .loop0
    cmp        r4d, 2
    je .loop2
    cmp        r4d, 4
    je .loop4
    cmp        r4d, 6
    je .loop6
    cmp        r4d, 8
    je .loop8
    cmp        r4d, 10
    je .loop10
    cmp        r4d, 12
    je .loop12
    ; Any other value falls through into .loop14.
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0                ; has no trailing jmp: must stay directly above .end
.end:
    ; Horizontal add of the four dword partial sums in m6.
    movhlps    m0, m6
    paddd      m6, m0
    pshuflw    m0, m6, 0x4e
    paddd      m6, m0
    movd       eax, m6
    RET

272 | |

273 | |

274 | |

275 |
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
;
; For every byte: dst = median(left, top, left + top - topleft) + diff, where
; left is the previously produced dst byte. All arithmetic is on bytes and
; wraps.
;
; Eight bytes are handled per outer iteration. Because each output feeds the
; next one, the eight predictions are computed one after another in an
; unrolled rep block; only the low byte of the working registers is
; meaningful inside it.
;
; Register roles inside the unrolled block (low byte):
;   mm0 = top - topleft      mm1 = top         mm2 = diff (residual)
;   mm3 = left / result      mm7 = output bytes, shifted in from the top
;
; The outer loop is bottom-tested and always works on whole 8-byte groups.
; On exit *left receives the last dst byte and *left_top the last top byte.
; No emms is issued here.
; NOTE(review): the initial 32-bit load of *left_top is OR-ed with top<<8, so
; the value presumably has to fit in one byte - confirm with the callers.
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    ; First group: topleft comes from *left_top instead of a previous group.
    movq       mm0, [topq]
    movq       mm2, mm0
    movd       mm4, [left_topq]
    psllq      mm2, 8
    movq       mm1, mm0                 ; t
    por        mm4, mm2                 ; tl = top shifted up one byte, with left_top in byte 0
    movd       mm3, [leftq]
    psubb      mm0, mm4 ; t-tl
    ; Point at the end of each row and index with a negative counter.
    add        dstq, wq
    add        topq, wq
    add        diffq, wq
    neg        wq
    jmp .predict
.next_group:
    movq       mm4, [topq+wq]
    movq       mm0, mm4
    psllq      mm4, 8
    por        mm4, mm1                 ; low byte of mm1 = last top byte of the previous group
    movq       mm1, mm0 ; t
    psubb      mm0, mm4 ; t-tl
.predict:
    movq       mm2, [diffq+wq]
%assign i 0
%rep 8
    movq       mm4, mm0
    paddb      mm4, mm3 ; t-tl+l
    movq       mm5, mm3
    pmaxub     mm3, mm1                 ; max(l, t)
    pminub     mm5, mm1                 ; min(l, t)
    pminub     mm3, mm4
    pmaxub     mm3, mm5 ; median
    paddb      mm3, mm2 ; +residual
    ; Append the new byte at the top of mm7; after eight steps the first
    ; result sits in byte 0.
%if i==0
    movq       mm7, mm3
    psllq      mm7, 56
%else
    movq       mm6, mm3
    psrlq      mm7, 8
    psllq      mm6, 56
    por        mm7, mm6
%endif
%if i<7
    ; Bring the next input byte down into the low position.
    psrlq      mm0, 8
    psrlq      mm1, 8
    psrlq      mm2, 8
%endif
%assign i i+1
%endrep
    movq       [dstq+wq], mm7
    add        wq, 8
    jl .next_group
    movzx      r2d, byte [dstq-1]
    mov        [leftq], r2d
    movzx      r2d, byte [topq-1]
    mov        [left_topq], r2d
    RET

333 | |

334 | |

335 |
;------------------------------------------------------------------------------
; ADD_HFYU_LEFT_LOOP is_aligned
;
; Shared body and epilogue of the add_hfyu_left_prediction functions: writes
; the running byte-wise sum (prefix sum, wrapping) of src to dst, seeded with
; the left value, and returns the last output byte.
;
; Expected on entry:
;   m0 = seed, with the left byte in the highest lane
;   m3, m4 (and m6 for XMM) = the widening shuffle masks
;   m5 = mask that broadcasts the highest lane to every byte
;
; The argument selects the store: non-zero uses one aligned mova, zero splits
; the store into movq + movhps so that dst may be unaligned. The source is
; always read with an aligned mova.
;
; The macro ends in RET, so it must be the last thing in a code path.
; Only the low byte of eax is the new left value; the pshufb that extracts it
; fills the other three bytes of eax with copies of output byte 0.
;------------------------------------------------------------------------------
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add        srcq, wq
    add        dstq, wq
    neg        wq
%%.next_block:
    mova       m1, [srcq+wq]
    ; Step 1: add every even byte to the odd byte above it (pairs).
    mova       m2, m1
    psllw      m1, 8
    paddb      m1, m2
    ; Step 2: add the pair totals to the following two bytes (groups of 4).
    mova       m2, m1
    pshufb     m1, m3
    paddb      m1, m2
    pshufb     m0, m5                   ; broadcast the last byte of the previous output
    ; Step 3: add the 4-byte totals to the following four bytes (groups of 8).
    mova       m2, m1
    pshufb     m1, m4
    paddb      m1, m2
%if mmsize == 16
    ; Step 4: add the total of the low 8 bytes to the high 8 bytes.
    mova       m2, m1
    pshufb     m1, m6
    paddb      m1, m2
%endif
    paddb      m0, m1                   ; carry in the running total
%if %1
    mova       [dstq+wq], m0
%else
    movq       [dstq+wq], m0
    movhps     [dstq+wq+8], m0
%endif
    add        wq, mmsize
    jl %%.next_block
    ; wq now holds how far the last block ran past the end, so the last valid
    ; output byte is lane mmsize-1-w. Move it to lane 0 and return it.
    mov        eax, mmsize-1
    sub        eax, wd
    movd       m1, eax
    pshufb     m0, m1
    movd       eax, m0
    RET
%endmacro

372 | |

373 |
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;
; SSSE3 variant working on 8-byte MMX registers.
; The .skip_prologue label is a second entry point: the sse4 variant jumps
; here when src is not 16-byte aligned. Everything after the label reloads
; its own masks and seed, so it does not depend on the register contents set
; up by that caller.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova       m5, [pb_7]               ; broadcast-last-byte mask
    mova       m4, [pb_zzzz3333zzzzbbbb]
    mova       m3, [pb_zz11zz55zz99zzdd]
    movd       m0, leftm
    psllq      m0, 56                   ; put left into the highest byte lane
    ADD_HFYU_LEFT_LOOP 1

383 | |

384 |
; 16-byte XMM variant of add_hfyu_left_prediction (same C signature as the
; ssse3 function).
;
; Dispatch on pointer alignment:
;   src not 16-byte aligned -> continue in the 8-byte ssse3 code
;   dst not 16-byte aligned -> XMM loop with split (movq + movhps) stores
;   both aligned            -> XMM loop with aligned stores
; Each ADD_HFYU_LEFT_LOOP expansion ends in RET, so the first expansion does
; not fall through into the second.
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova       m5, [pb_f]               ; broadcast-last-byte mask
    mova       m6, [pb_zzzzzzzz77777777] ; 16-byte load: also covers the adjacent pb_7
    mova       m4, [pb_zzzz3333zzzzbbbb]
    mova       m3, [pb_zz11zz55zz99zzdd]
    movd       m0, leftm
    pslldq     m0, 15                   ; put left into the highest byte lane
    test       srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test       dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0

399 | |

400 | |

401 |
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the sum of v1[i]*v2[i]. The third argument is named offset because
; it is turned into a negative byte offset: both pointers are moved to the end
; of their arrays and the offset runs up towards zero in steps of 16 bytes
; (four floats).
;
; Both arrays are read with alignment-requiring accesses (movaps, and mulps
; with a memory operand). The loop is bottom-tested, so len is assumed to be
; a positive multiple of 4.
; NOTE(review): that requirement is inferred from the loop shape - confirm
; with the callers.
;
; The result is left in xmm0. On 32-bit builds it is additionally bounced
; through the first argument slot and loaded onto the x87 stack with fld.
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg        offsetq
    shl        offsetq, 2               ; offset = -len * sizeof(float)
    sub        v1q, offsetq             ; subtracting a negative: v1 += len*4
    sub        v2q, offsetq
    xorps      xmm0, xmm0               ; four partial sums
.accumulate:
    movaps     xmm1, [v1q+offsetq]
    mulps      xmm1, [v2q+offsetq]
    addps      xmm0, xmm1
    add        offsetq, 16
    js .accumulate                      ; loop while the offset is still negative
    ; Horizontal add: fold the high pair onto the low pair, then lane 1 onto lane 0.
    movhlps    xmm1, xmm0
    addps      xmm0, xmm1
    movss      xmm1, xmm0
    shufps     xmm0, xmm0, 1
    addss      xmm0, xmm1
%ifndef ARCH_X86_64
    movd       r0m, xmm0
    fld        dword r0m
%endif
    RET