## ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ b10fa1bb

History | View | Annotate | Download (7.19 KB)

1 | 7ca7d5fa | Loren Merritt | ;****************************************************************************** |
---|---|---|---|

2 | ;* MMX optimized DSP utils |
||

3 | ;* Copyright (c) 2008 Loren Merritt |
||

4 | ;* |
||

5 | ;* This file is part of FFmpeg. |
||

6 | ;* |
||

7 | ;* FFmpeg is free software; you can redistribute it and/or |
||

8 | ;* modify it under the terms of the GNU Lesser General Public |
||

9 | ;* License as published by the Free Software Foundation; either |
||

10 | ;* version 2.1 of the License, or (at your option) any later version. |
||

11 | ;* |
||

12 | ;* FFmpeg is distributed in the hope that it will be useful, |
||

13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

15 | ;* Lesser General Public License for more details. |
||

16 | ;* |
||

17 | ;* You should have received a copy of the GNU Lesser General Public |
||

18 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

19 | ;* License along with FFmpeg; if not, write to the Free Software |
||

20 | ;****************************************************************************** |
||

21 | |||

22 | %include "x86inc.asm" |
||

23 | |||

24 | 2f77923d | Loren Merritt | SECTION_RODATA |

25 | pb_f: times 16 db 15 |
||

26 | pb_zzzzzzzz77777777: times 8 db -1 |
||

27 | pb_7: times 8 db 7 |
||

28 | pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 |
||

29 | pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
||

30 | |||

31 | 7ca7d5fa | Loren Merritt | section .text align=16 |

32 | |||

33 | %macro PSWAPD_SSE 2 |
||

34 | pshufw %1, %2, 0x4e |
||

35 | %endmacro |
||

36 | %macro PSWAPD_3DN1 2 |
||

37 | movq %1, %2 |
||

38 | psrlq %1, 32 |
||

39 | punpckldq %1, %2 |
||

40 | %endmacro |
||

41 | |||

42 | %macro FLOAT_TO_INT16_INTERLEAVE6 1 |
||

43 | ; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
||

44 | 40c7d0ae | Jason Garrett-Glaser | cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |

45 | 7ca7d5fa | Loren Merritt | %ifdef ARCH_X86_64 |

46 | %define lend r10d |
||

47 | mov lend, r2d |
||

48 | %else |
||

49 | %define lend dword r2m |
||

50 | %endif |
||

51 | mov src1q, [srcq+1*gprsize] |
||

52 | mov src2q, [srcq+2*gprsize] |
||

53 | mov src3q, [srcq+3*gprsize] |
||

54 | mov src4q, [srcq+4*gprsize] |
||

55 | mov src5q, [srcq+5*gprsize] |
||

56 | mov srcq, [srcq] |
||

57 | sub src1q, srcq |
||

58 | sub src2q, srcq |
||

59 | sub src3q, srcq |
||

60 | sub src4q, srcq |
||

61 | sub src5q, srcq |
||

62 | .loop: |
||

63 | cvtps2pi mm0, [srcq] |
||

64 | cvtps2pi mm1, [srcq+src1q] |
||

65 | cvtps2pi mm2, [srcq+src2q] |
||

66 | cvtps2pi mm3, [srcq+src3q] |
||

67 | cvtps2pi mm4, [srcq+src4q] |
||

68 | cvtps2pi mm5, [srcq+src5q] |
||

69 | packssdw mm0, mm3 |
||

70 | packssdw mm1, mm4 |
||

71 | packssdw mm2, mm5 |
||

72 | pswapd mm3, mm0 |
||

73 | punpcklwd mm0, mm1 |
||

74 | punpckhwd mm1, mm2 |
||

75 | punpcklwd mm2, mm3 |
||

76 | pswapd mm3, mm0 |
||

77 | punpckldq mm0, mm2 |
||

78 | punpckhdq mm2, mm1 |
||

79 | punpckldq mm1, mm3 |
||

80 | movq [dstq ], mm0 |
||

81 | movq [dstq+16], mm2 |
||

82 | movq [dstq+ 8], mm1 |
||

83 | add srcq, 8 |
||

84 | add dstq, 24 |
||

85 | sub lend, 2 |
||

86 | jg .loop |
||

87 | emms |
||

88 | RET |
||

89 | %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
||

90 | |||

91 | %define pswapd PSWAPD_SSE |
||

92 | FLOAT_TO_INT16_INTERLEAVE6 sse |
||

93 | %define cvtps2pi pf2id |
||

94 | %define pswapd PSWAPD_3DN1 |
||

95 | FLOAT_TO_INT16_INTERLEAVE6 3dnow |
||

96 | %undef pswapd |
||

97 | FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
||

98 | %undef cvtps2pi |
||

99 | |||

100 | 3daa434a | Loren Merritt | |

101 | |||

102 | b10fa1bb | Loren Merritt | %macro SCALARPRODUCT 1 |

103 | ; void add_int16(int16_t * v1, int16_t * v2, int order) |
||

104 | cglobal add_int16_%1, 3,3,2, v1, v2, order |
||

105 | shl orderq, 1 |
||

106 | add v1q, orderq |
||

107 | add v2q, orderq |
||

108 | neg orderq |
||

109 | .loop: |
||

110 | movu m0, [v2q + orderq] |
||

111 | movu m1, [v2q + orderq + mmsize] |
||

112 | paddw m0, [v1q + orderq] |
||

113 | paddw m1, [v1q + orderq + mmsize] |
||

114 | mova [v1q + orderq], m0 |
||

115 | mova [v1q + orderq + mmsize], m1 |
||

116 | add orderq, mmsize*2 |
||

117 | jl .loop |
||

118 | REP_RET |
||

119 | |||

120 | ; void sub_int16(int16_t * v1, int16_t * v2, int order) |
||

121 | cglobal sub_int16_%1, 3,3,4, v1, v2, order |
||

122 | shl orderq, 1 |
||

123 | add v1q, orderq |
||

124 | add v2q, orderq |
||

125 | neg orderq |
||

126 | .loop: |
||

127 | movu m2, [v2q + orderq] |
||

128 | movu m3, [v2q + orderq + mmsize] |
||

129 | mova m0, [v1q + orderq] |
||

130 | mova m1, [v1q + orderq + mmsize] |
||

131 | psubw m0, m2 |
||

132 | psubw m1, m3 |
||

133 | mova [v1q + orderq], m0 |
||

134 | mova [v1q + orderq + mmsize], m1 |
||

135 | add orderq, mmsize*2 |
||

136 | jl .loop |
||

137 | REP_RET |
||

138 | |||

139 | ; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) |
||

140 | cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift |
||

141 | shl orderq, 1 |
||

142 | add v1q, orderq |
||

143 | add v2q, orderq |
||

144 | neg orderq |
||

145 | movd m3, shiftm |
||

146 | pxor m2, m2 |
||

147 | .loop: |
||

148 | movu m0, [v1q + orderq] |
||

149 | movu m1, [v1q + orderq + mmsize] |
||

150 | pmaddwd m0, [v2q + orderq] |
||

151 | pmaddwd m1, [v2q + orderq + mmsize] |
||

152 | paddd m2, m0 |
||

153 | paddd m2, m1 |
||

154 | add orderq, mmsize*2 |
||

155 | jl .loop |
||

156 | %if mmsize == 16 |
||

157 | movhlps m0, m2 |
||

158 | paddd m2, m0 |
||

159 | psrad m2, m3 |
||

160 | pshuflw m0, m2, 0x4e |
||

161 | %else |
||

162 | psrad m2, m3 |
||

163 | pshufw m0, m2, 0x4e |
||

164 | %endif |
||

165 | paddd m2, m0 |
||

166 | movd eax, m2 |
||

167 | RET |
||

168 | %endmacro |
||

169 | |||

170 | INIT_MMX |
||

171 | SCALARPRODUCT mmx2 |
||

172 | INIT_XMM |
||

173 | SCALARPRODUCT sse2 |
||

174 | |||

175 | |||

176 | |||

177 | e17ccf60 | Loren Merritt | ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) |

178 | 3daa434a | Loren Merritt | cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top |

179 | movq mm0, [topq] |
||

180 | movq mm2, mm0 |
||

181 | movd mm4, [left_topq] |
||

182 | psllq mm2, 8 |
||

183 | movq mm1, mm0 |
||

184 | por mm4, mm2 |
||

185 | movd mm3, [leftq] |
||

186 | psubb mm0, mm4 ; t-tl |
||

187 | add dstq, wq |
||

188 | add topq, wq |
||

189 | add diffq, wq |
||

190 | neg wq |
||

191 | jmp .skip |
||

192 | .loop: |
||

193 | movq mm4, [topq+wq] |
||

194 | movq mm0, mm4 |
||

195 | psllq mm4, 8 |
||

196 | por mm4, mm1 |
||

197 | movq mm1, mm0 ; t |
||

198 | psubb mm0, mm4 ; t-tl |
||

199 | .skip: |
||

200 | movq mm2, [diffq+wq] |
||

201 | %assign i 0 |
||

202 | %rep 8 |
||

203 | movq mm4, mm0 |
||

204 | paddb mm4, mm3 ; t-tl+l |
||

205 | movq mm5, mm3 |
||

206 | pmaxub mm3, mm1 |
||

207 | pminub mm5, mm1 |
||

208 | pminub mm3, mm4 |
||

209 | pmaxub mm3, mm5 ; median |
||

210 | paddb mm3, mm2 ; +residual |
||

211 | %if i==0 |
||

212 | movq mm7, mm3 |
||

213 | psllq mm7, 56 |
||

214 | %else |
||

215 | movq mm6, mm3 |
||

216 | psrlq mm7, 8 |
||

217 | psllq mm6, 56 |
||

218 | por mm7, mm6 |
||

219 | %endif |
||

220 | %if i<7 |
||

221 | psrlq mm0, 8 |
||

222 | psrlq mm1, 8 |
||

223 | psrlq mm2, 8 |
||

224 | %endif |
||

225 | %assign i i+1 |
||

226 | %endrep |
||

227 | movq [dstq+wq], mm7 |
||

228 | add wq, 8 |
||

229 | jl .loop |
||

230 | movzx r2d, byte [dstq-1] |
||

231 | mov [leftq], r2d |
||

232 | movzx r2d, byte [topq-1] |
||

233 | mov [left_topq], r2d |
||

234 | RET |
||

235 | 2f77923d | Loren Merritt | |

236 | |||

237 | %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned |
||

238 | add srcq, wq |
||

239 | add dstq, wq |
||

240 | neg wq |
||

241 | %%.loop: |
||

242 | mova m1, [srcq+wq] |
||

243 | mova m2, m1 |
||

244 | psllw m1, 8 |
||

245 | paddb m1, m2 |
||

246 | mova m2, m1 |
||

247 | pshufb m1, m3 |
||

248 | paddb m1, m2 |
||

249 | pshufb m0, m5 |
||

250 | mova m2, m1 |
||

251 | pshufb m1, m4 |
||

252 | paddb m1, m2 |
||

253 | %if mmsize == 16 |
||

254 | mova m2, m1 |
||

255 | pshufb m1, m6 |
||

256 | paddb m1, m2 |
||

257 | %endif |
||

258 | paddb m0, m1 |
||

259 | %if %1 |
||

260 | mova [dstq+wq], m0 |
||

261 | %else |
||

262 | movq [dstq+wq], m0 |
||

263 | movhps [dstq+wq+8], m0 |
||

264 | %endif |
||

265 | add wq, mmsize |
||

266 | jl %%.loop |
||

267 | mov eax, mmsize-1 |
||

268 | sub eax, wd |
||

269 | movd m1, eax |
||

270 | pshufb m0, m1 |
||

271 | movd eax, m0 |
||

272 | RET |
||

273 | %endmacro |
||

274 | |||

275 | e17ccf60 | Loren Merritt | ; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) |

276 | 2f77923d | Loren Merritt | INIT_MMX |

277 | cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left |
||

278 | .skip_prologue: |
||

279 | mova m5, [pb_7 GLOBAL] |
||

280 | mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] |
||

281 | mova m3, [pb_zz11zz55zz99zzdd GLOBAL] |
||

282 | movd m0, leftm |
||

283 | psllq m0, 56 |
||

284 | ADD_HFYU_LEFT_LOOP 1 |
||

285 | |||

286 | INIT_XMM |
||

287 | cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left |
||

288 | mova m5, [pb_f GLOBAL] |
||

289 | mova m6, [pb_zzzzzzzz77777777 GLOBAL] |
||

290 | mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] |
||

291 | mova m3, [pb_zz11zz55zz99zzdd GLOBAL] |
||

292 | movd m0, leftm |
||

293 | pslldq m0, 15 |
||

294 | test srcq, 15 |
||

295 | b07781b6 | Loren Merritt | jnz add_hfyu_left_prediction_ssse3.skip_prologue |

296 | 2f77923d | Loren Merritt | test dstq, 15 |

297 | jnz .unaligned |
||

298 | ADD_HFYU_LEFT_LOOP 1 |
||

299 | .unaligned: |
||

300 | ADD_HFYU_LEFT_LOOP 0 |