## ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ 98c6053c

History | View | Annotate | Download (9.77 KB)

1 | 7ca7d5fa | Loren Merritt | ;****************************************************************************** |
---|---|---|---|

2 | ;* MMX optimized DSP utils |
||

3 | ;* Copyright (c) 2008 Loren Merritt |
||

4 | ;* |
||

5 | ;* This file is part of FFmpeg. |
||

6 | ;* |
||

7 | ;* FFmpeg is free software; you can redistribute it and/or |
||

8 | ;* modify it under the terms of the GNU Lesser General Public |
||

9 | ;* License as published by the Free Software Foundation; either |
||

10 | ;* version 2.1 of the License, or (at your option) any later version. |
||

11 | ;* |
||

12 | ;* FFmpeg is distributed in the hope that it will be useful, |
||

13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

15 | ;* Lesser General Public License for more details. |
||

16 | ;* |
||

17 | ;* You should have received a copy of the GNU Lesser General Public |
||

18 | ;* License along with FFmpeg; if not, write to the Free Software |
||

19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

20 | ;****************************************************************************** |
||

21 | |||

22 | %include "x86inc.asm" |
||

23 | |||

24 | 2f77923d | Loren Merritt | SECTION_RODATA |

25 | pb_f: times 16 db 15 |
||

26 | pb_zzzzzzzz77777777: times 8 db -1 |
||

27 | pb_7: times 8 db 7 |
||

28 | pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 |
||

29 | pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
||

30 | |||

31 | 7ca7d5fa | Loren Merritt | section .text align=16 |

32 | |||

33 | %macro PSWAPD_SSE 2 |
||

34 | pshufw %1, %2, 0x4e |
||

35 | %endmacro |
||

36 | %macro PSWAPD_3DN1 2 |
||

37 | movq %1, %2 |
||

38 | psrlq %1, 32 |
||

39 | punpckldq %1, %2 |
||

40 | %endmacro |
||

41 | |||

42 | %macro FLOAT_TO_INT16_INTERLEAVE6 1 |
||

43 | 2966cc18 | Jason Garrett-Glaser | ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |

44 | 40c7d0ae | Jason Garrett-Glaser | cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |

45 | 7ca7d5fa | Loren Merritt | %ifdef ARCH_X86_64 |

46 | %define lend r10d |
||

47 | mov lend, r2d |
||

48 | %else |
||

49 | %define lend dword r2m |
||

50 | %endif |
||

51 | mov src1q, [srcq+1*gprsize] |
||

52 | mov src2q, [srcq+2*gprsize] |
||

53 | mov src3q, [srcq+3*gprsize] |
||

54 | mov src4q, [srcq+4*gprsize] |
||

55 | mov src5q, [srcq+5*gprsize] |
||

56 | mov srcq, [srcq] |
||

57 | sub src1q, srcq |
||

58 | sub src2q, srcq |
||

59 | sub src3q, srcq |
||

60 | sub src4q, srcq |
||

61 | sub src5q, srcq |
||

62 | .loop: |
||

63 | cvtps2pi mm0, [srcq] |
||

64 | cvtps2pi mm1, [srcq+src1q] |
||

65 | cvtps2pi mm2, [srcq+src2q] |
||

66 | cvtps2pi mm3, [srcq+src3q] |
||

67 | cvtps2pi mm4, [srcq+src4q] |
||

68 | cvtps2pi mm5, [srcq+src5q] |
||

69 | packssdw mm0, mm3 |
||

70 | packssdw mm1, mm4 |
||

71 | packssdw mm2, mm5 |
||

72 | pswapd mm3, mm0 |
||

73 | punpcklwd mm0, mm1 |
||

74 | punpckhwd mm1, mm2 |
||

75 | punpcklwd mm2, mm3 |
||

76 | pswapd mm3, mm0 |
||

77 | punpckldq mm0, mm2 |
||

78 | punpckhdq mm2, mm1 |
||

79 | punpckldq mm1, mm3 |
||

80 | movq [dstq ], mm0 |
||

81 | movq [dstq+16], mm2 |
||

82 | movq [dstq+ 8], mm1 |
||

83 | add srcq, 8 |
||

84 | add dstq, 24 |
||

85 | sub lend, 2 |
||

86 | jg .loop |
||

87 | emms |
||

88 | RET |
||

89 | %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
||

90 | |||

91 | %define pswapd PSWAPD_SSE |
||

92 | FLOAT_TO_INT16_INTERLEAVE6 sse |
||

93 | %define cvtps2pi pf2id |
||

94 | %define pswapd PSWAPD_3DN1 |
||

95 | FLOAT_TO_INT16_INTERLEAVE6 3dnow |
||

96 | %undef pswapd |
||

97 | FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
||

98 | %undef cvtps2pi |
||

99 | |||

100 | 3daa434a | Loren Merritt | |

101 | |||

102 | b10fa1bb | Loren Merritt | %macro SCALARPRODUCT 1 |

103 | b1159ad9 | Loren Merritt | ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) |

104 | b10fa1bb | Loren Merritt | cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift |

105 | shl orderq, 1 |
||

106 | add v1q, orderq |
||

107 | add v2q, orderq |
||

108 | neg orderq |
||

109 | movd m3, shiftm |
||

110 | pxor m2, m2 |
||

111 | .loop: |
||

112 | movu m0, [v1q + orderq] |
||

113 | movu m1, [v1q + orderq + mmsize] |
||

114 | pmaddwd m0, [v2q + orderq] |
||

115 | pmaddwd m1, [v2q + orderq + mmsize] |
||

116 | paddd m2, m0 |
||

117 | paddd m2, m1 |
||

118 | add orderq, mmsize*2 |
||

119 | jl .loop |
||

120 | %if mmsize == 16 |
||

121 | movhlps m0, m2 |
||

122 | paddd m2, m0 |
||

123 | psrad m2, m3 |
||

124 | pshuflw m0, m2, 0x4e |
||

125 | %else |
||

126 | psrad m2, m3 |
||

127 | pshufw m0, m2, 0x4e |
||

128 | %endif |
||

129 | paddd m2, m0 |
||

130 | movd eax, m2 |
||

131 | RET |
||

132 | b1159ad9 | Loren Merritt | |

133 | ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) |
||

134 | 758c7455 | Loren Merritt | cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul |

135 | b1159ad9 | Loren Merritt | shl orderq, 1 |

136 | movd m7, mulm |
||

137 | %if mmsize == 16 |
||

138 | pshuflw m7, m7, 0 |
||

139 | punpcklqdq m7, m7 |
||

140 | %else |
||

141 | pshufw m7, m7, 0 |
||

142 | %endif |
||

143 | pxor m6, m6 |
||

144 | add v1q, orderq |
||

145 | add v2q, orderq |
||

146 | add v3q, orderq |
||

147 | neg orderq |
||

148 | .loop: |
||

149 | movu m0, [v2q + orderq] |
||

150 | movu m1, [v2q + orderq + mmsize] |
||

151 | mova m4, [v1q + orderq] |
||

152 | mova m5, [v1q + orderq + mmsize] |
||

153 | movu m2, [v3q + orderq] |
||

154 | movu m3, [v3q + orderq + mmsize] |
||

155 | pmaddwd m0, m4 |
||

156 | pmaddwd m1, m5 |
||

157 | pmullw m2, m7 |
||

158 | pmullw m3, m7 |
||

159 | paddd m6, m0 |
||

160 | paddd m6, m1 |
||

161 | paddw m2, m4 |
||

162 | paddw m3, m5 |
||

163 | mova [v1q + orderq], m2 |
||

164 | mova [v1q + orderq + mmsize], m3 |
||

165 | add orderq, mmsize*2 |
||

166 | jl .loop |
||

167 | %if mmsize == 16 |
||

168 | movhlps m0, m6 |
||

169 | paddd m6, m0 |
||

170 | pshuflw m0, m6, 0x4e |
||

171 | %else |
||

172 | pshufw m0, m6, 0x4e |
||

173 | %endif |
||

174 | paddd m6, m0 |
||

175 | movd eax, m6 |
||

176 | RET |
||

177 | b10fa1bb | Loren Merritt | %endmacro |

178 | |||

179 | INIT_MMX |
||

180 | SCALARPRODUCT mmx2 |
||

181 | INIT_XMM |
||

182 | SCALARPRODUCT sse2 |
||

183 | |||

184 | b1159ad9 | Loren Merritt | %macro SCALARPRODUCT_LOOP 1 |

185 | align 16 |
||

186 | .loop%1: |
||

187 | sub orderq, mmsize*2 |
||

188 | %if %1 |
||

189 | mova m1, m4 |
||

190 | mova m4, [v2q + orderq] |
||

191 | mova m0, [v2q + orderq + mmsize] |
||

192 | palignr m1, m0, %1 |
||

193 | palignr m0, m4, %1 |
||

194 | mova m3, m5 |
||

195 | mova m5, [v3q + orderq] |
||

196 | mova m2, [v3q + orderq + mmsize] |
||

197 | palignr m3, m2, %1 |
||

198 | palignr m2, m5, %1 |
||

199 | %else |
||

200 | mova m0, [v2q + orderq] |
||

201 | mova m1, [v2q + orderq + mmsize] |
||

202 | mova m2, [v3q + orderq] |
||

203 | mova m3, [v3q + orderq + mmsize] |
||

204 | %endif |
||

205 | a4605efd | Loren Merritt | %define t0 [v1q + orderq] |

206 | %define t1 [v1q + orderq + mmsize] |
||

207 | %ifdef ARCH_X86_64 |
||

208 | mova m8, t0 |
||

209 | mova m9, t1 |
||

210 | %define t0 m8 |
||

211 | %define t1 m9 |
||

212 | %endif |
||

213 | pmaddwd m0, t0 |
||

214 | pmaddwd m1, t1 |
||

215 | b1159ad9 | Loren Merritt | pmullw m2, m7 |

216 | pmullw m3, m7 |
||

217 | a4605efd | Loren Merritt | paddw m2, t0 |

218 | paddw m3, t1 |
||

219 | b1159ad9 | Loren Merritt | paddd m6, m0 |

220 | paddd m6, m1 |
||

221 | mova [v1q + orderq], m2 |
||

222 | mova [v1q + orderq + mmsize], m3 |
||

223 | jg .loop%1 |
||

224 | %if %1 |
||

225 | jmp .end |
||

226 | %endif |
||

227 | %endmacro |
||

228 | |||

229 | ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) |
||

230 | a4605efd | Loren Merritt | cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul |

231 | b1159ad9 | Loren Merritt | shl orderq, 1 |

232 | movd m7, mulm |
||

233 | pshuflw m7, m7, 0 |
||

234 | punpcklqdq m7, m7 |
||

235 | pxor m6, m6 |
||

236 | mov r4d, v2d |
||

237 | and r4d, 15 |
||

238 | and v2q, ~15 |
||

239 | and v3q, ~15 |
||

240 | mova m4, [v2q + orderq] |
||

241 | mova m5, [v3q + orderq] |
||

242 | ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) |
||

243 | cmp r4d, 0 |
||

244 | je .loop0 |
||

245 | cmp r4d, 2 |
||

246 | je .loop2 |
||

247 | cmp r4d, 4 |
||

248 | je .loop4 |
||

249 | cmp r4d, 6 |
||

250 | je .loop6 |
||

251 | cmp r4d, 8 |
||

252 | je .loop8 |
||

253 | cmp r4d, 10 |
||

254 | je .loop10 |
||

255 | cmp r4d, 12 |
||

256 | je .loop12 |
||

257 | SCALARPRODUCT_LOOP 14 |
||

258 | SCALARPRODUCT_LOOP 12 |
||

259 | SCALARPRODUCT_LOOP 10 |
||

260 | SCALARPRODUCT_LOOP 8 |
||

261 | SCALARPRODUCT_LOOP 6 |
||

262 | SCALARPRODUCT_LOOP 4 |
||

263 | SCALARPRODUCT_LOOP 2 |
||

264 | SCALARPRODUCT_LOOP 0 |
||

265 | .end: |
||

266 | movhlps m0, m6 |
||

267 | paddd m6, m0 |
||

268 | pshuflw m0, m6, 0x4e |
||

269 | paddd m6, m0 |
||

270 | movd eax, m6 |
||

271 | RET |
||

272 | |||

273 | b10fa1bb | Loren Merritt | |

274 | |||

275 | 2966cc18 | Jason Garrett-Glaser | ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) |

276 | 3daa434a | Loren Merritt | cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top |

277 | movq mm0, [topq] |
||

278 | movq mm2, mm0 |
||

279 | movd mm4, [left_topq] |
||

280 | psllq mm2, 8 |
||

281 | movq mm1, mm0 |
||

282 | por mm4, mm2 |
||

283 | movd mm3, [leftq] |
||

284 | psubb mm0, mm4 ; t-tl |
||

285 | add dstq, wq |
||

286 | add topq, wq |
||

287 | add diffq, wq |
||

288 | neg wq |
||

289 | jmp .skip |
||

290 | .loop: |
||

291 | movq mm4, [topq+wq] |
||

292 | movq mm0, mm4 |
||

293 | psllq mm4, 8 |
||

294 | por mm4, mm1 |
||

295 | movq mm1, mm0 ; t |
||

296 | psubb mm0, mm4 ; t-tl |
||

297 | .skip: |
||

298 | movq mm2, [diffq+wq] |
||

299 | %assign i 0 |
||

300 | %rep 8 |
||

301 | movq mm4, mm0 |
||

302 | paddb mm4, mm3 ; t-tl+l |
||

303 | movq mm5, mm3 |
||

304 | pmaxub mm3, mm1 |
||

305 | pminub mm5, mm1 |
||

306 | pminub mm3, mm4 |
||

307 | pmaxub mm3, mm5 ; median |
||

308 | paddb mm3, mm2 ; +residual |
||

309 | %if i==0 |
||

310 | movq mm7, mm3 |
||

311 | psllq mm7, 56 |
||

312 | %else |
||

313 | movq mm6, mm3 |
||

314 | psrlq mm7, 8 |
||

315 | psllq mm6, 56 |
||

316 | por mm7, mm6 |
||

317 | %endif |
||

318 | %if i<7 |
||

319 | psrlq mm0, 8 |
||

320 | psrlq mm1, 8 |
||

321 | psrlq mm2, 8 |
||

322 | %endif |
||

323 | %assign i i+1 |
||

324 | %endrep |
||

325 | movq [dstq+wq], mm7 |
||

326 | add wq, 8 |
||

327 | jl .loop |
||

328 | movzx r2d, byte [dstq-1] |
||

329 | mov [leftq], r2d |
||

330 | movzx r2d, byte [topq-1] |
||

331 | mov [left_topq], r2d |
||

332 | RET |
||

333 | 2f77923d | Loren Merritt | |

334 | |||

335 | %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned |
||

336 | add srcq, wq |
||

337 | add dstq, wq |
||

338 | neg wq |
||

339 | %%.loop: |
||

340 | mova m1, [srcq+wq] |
||

341 | mova m2, m1 |
||

342 | psllw m1, 8 |
||

343 | paddb m1, m2 |
||

344 | mova m2, m1 |
||

345 | pshufb m1, m3 |
||

346 | paddb m1, m2 |
||

347 | pshufb m0, m5 |
||

348 | mova m2, m1 |
||

349 | pshufb m1, m4 |
||

350 | paddb m1, m2 |
||

351 | %if mmsize == 16 |
||

352 | mova m2, m1 |
||

353 | pshufb m1, m6 |
||

354 | paddb m1, m2 |
||

355 | %endif |
||

356 | paddb m0, m1 |
||

357 | %if %1 |
||

358 | mova [dstq+wq], m0 |
||

359 | %else |
||

360 | movq [dstq+wq], m0 |
||

361 | movhps [dstq+wq+8], m0 |
||

362 | %endif |
||

363 | add wq, mmsize |
||

364 | jl %%.loop |
||

365 | mov eax, mmsize-1 |
||

366 | sub eax, wd |
||

367 | movd m1, eax |
||

368 | pshufb m0, m1 |
||

369 | movd eax, m0 |
||

370 | RET |
||

371 | %endmacro |
||

372 | |||

373 | 2966cc18 | Jason Garrett-Glaser | ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) |

374 | 2f77923d | Loren Merritt | INIT_MMX |

375 | cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left |
||

376 | .skip_prologue: |
||

377 | 2966cc18 | Jason Garrett-Glaser | mova m5, [pb_7] |

378 | mova m4, [pb_zzzz3333zzzzbbbb] |
||

379 | mova m3, [pb_zz11zz55zz99zzdd] |
||

380 | 2f77923d | Loren Merritt | movd m0, leftm |

381 | psllq m0, 56 |
||

382 | ADD_HFYU_LEFT_LOOP 1 |
||

383 | |||

384 | INIT_XMM |
||

385 | cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left |
||

386 | 2966cc18 | Jason Garrett-Glaser | mova m5, [pb_f] |

387 | mova m6, [pb_zzzzzzzz77777777] |
||

388 | mova m4, [pb_zzzz3333zzzzbbbb] |
||

389 | mova m3, [pb_zz11zz55zz99zzdd] |
||

390 | 2f77923d | Loren Merritt | movd m0, leftm |

391 | pslldq m0, 15 |
||

392 | test srcq, 15 |
||

393 | b07781b6 | Loren Merritt | jnz add_hfyu_left_prediction_ssse3.skip_prologue |

394 | 2f77923d | Loren Merritt | test dstq, 15 |

395 | jnz .unaligned |
||

396 | ADD_HFYU_LEFT_LOOP 1 |
||

397 | .unaligned: |
||

398 | ADD_HFYU_LEFT_LOOP 0 |
||

399 | |||

400 | 3deb5384 | Alex Converse | |

401 | 2966cc18 | Jason Garrett-Glaser | ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) |

402 | 3deb5384 | Alex Converse | cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset |

403 | neg offsetq |
||

404 | shl offsetq, 2 |
||

405 | sub v1q, offsetq |
||

406 | sub v2q, offsetq |
||

407 | xorps xmm0, xmm0 |
||

408 | .loop: |
||

409 | movaps xmm1, [v1q+offsetq] |
||

410 | mulps xmm1, [v2q+offsetq] |
||

411 | addps xmm0, xmm1 |
||

412 | add offsetq, 16 |
||

413 | js .loop |
||

414 | movhlps xmm1, xmm0 |
||

415 | addps xmm0, xmm1 |
||

416 | movss xmm1, xmm0 |
||

417 | shufps xmm0, xmm0, 1 |
||

418 | addss xmm0, xmm1 |
||

419 | %ifndef ARCH_X86_64 |
||

420 | movd r0m, xmm0 |
||

421 | fld dword r0m |
||

422 | %endif |
||

423 | RET |