## ffmpeg / libavcodec / x86 / dsputil_yasm.asm @ b10fa1bb


;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
; note: pb_zzzzzzzz77777777 deliberately runs into pb_7; loaded as 16 bytes
; it reads 8 x -1 followed by 8 x 7 (see the sse4 left prediction below)

section .text align=16

; pswapd: swap the two 32-bit halves of a 64-bit MMX register.
; It is a native instruction only in extended 3DNow!, so emulate it elsewhere.
%macro PSWAPD_SSE 2
    pshufw    %1, %2, 0x4e     ; 0x4e = 01,00,11,10 -> words 2,3,0,1
%endmacro
%macro PSWAPD_3DN1 2
    movq      %1, %2
    psrlq     %1, 32           ; high dword down to the low half
    punpckldq %1, %2           ; old low dword up to the high half
%endmacro
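; A hedged C sketch (not part of the original source) of the dword swap both
; macros implement:
;
;   #include <stdint.h>
;   static inline uint64_t pswapd_ref(uint64_t x)
;   {
;       return (x >> 32) | (x << 32); /* exchange the two 32-bit halves */
;   }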

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif

; turn the five remaining channel pointers into offsets from src[0],
; so a single induction variable (srcq) advances all six input streams
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq

.loop:
    ; convert two floats from each of the six channels to int32
    cvtps2pi mm0, [srcq]
    cvtps2pi mm1, [srcq+src1q]
    cvtps2pi mm2, [srcq+src2q]
    cvtps2pi mm3, [srcq+src3q]
    cvtps2pi mm4, [srcq+src4q]
    cvtps2pi mm5, [srcq+src5q]
    ; saturate to int16 and transpose the 6x2 block into interleaved order
    packssdw mm0, mm3
    packssdw mm1, mm4
    packssdw mm2, mm5
    pswapd   mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    pswapd   mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add     srcq, 8            ; 2 floats consumed per channel
    add     dstq, 24           ; 2 samples * 6 channels * 2 bytes written
    sub     lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
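; A hedged C reference (an illustration, not from this file) of what each
; FLOAT_TO_INT16_INTERLEAVE6 instantiation computes; cvtps2pi rounds to
; nearest (pf2id truncates) and packssdw saturates to int16:
;
;   #include <stdint.h>
;   #include <math.h>
;   static void float_to_int16_interleave6_ref(int16_t *dst,
;                                              const float **src, int len)
;   {
;       for (int i = 0; i < len; i++)
;           for (int c = 0; c < 6; c++) {
;               long v = lrintf(src[c][i]);
;               if (v >  32767) v =  32767;
;               if (v < -32768) v = -32768;
;               dst[6*i + c] = v;
;           }
;   }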

%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id         ; 3DNow! packed float->int32 conversion
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd                  ; extended 3DNow! has pswapd as a real instruction
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi

%macro SCALARPRODUCT 1
; void add_int16(int16_t * v1, int16_t * v2, int order)
cglobal add_int16_%1, 3,3,2, v1, v2, order
    shl orderq, 1              ; element count -> byte count
    add v1q, orderq            ; point past the end of both arrays and
    add v2q, orderq            ; count a negative offset up towards zero
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    paddw   m0, [v1q + orderq]
    paddw   m1, [v1q + orderq + mmsize]
    mova    [v1q + orderq], m0
    mova    [v1q + orderq + mmsize], m1
    add     orderq, mmsize*2
    jl .loop
    REP_RET
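; Hedged C equivalent (an illustration; assumes order is a multiple of the
; per-iteration element count, i.e. mmsize int16s) of add_int16:
;
;   static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
;   {
;       for (int i = 0; i < order; i++)
;           v1[i] += v2[i];   /* wraps modulo 2^16, like paddw */
;   }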

; void sub_int16(int16_t * v1, int16_t * v2, int order)
cglobal sub_int16_%1, 3,3,4, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
.loop:
    movu    m2, [v2q + orderq]
    movu    m3, [v2q + orderq + mmsize]
    mova    m0, [v1q + orderq]
    mova    m1, [v1q + orderq + mmsize]
    psubw   m0, m2
    psubw   m1, m3
    mova    [v1q + orderq], m0
    mova    [v1q + orderq + mmsize], m1
    add     orderq, mmsize*2
    jl .loop
    REP_RET
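; sub_int16 follows the same pattern with psubw; roughly, and under the same
; assumptions as the add_int16 sketch above:
;
;   for (int i = 0; i < order; i++)
;       v1[i] -= v2[i];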

; int scalarproduct_int16(int16_t * v1, int16_t * v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm         ; shift count for the final scaling
    pxor    m2, m2             ; running dword accumulator
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; 16x16->32 multiply, pairwise add
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
    ; horizontal reduction of the partial sums, scaled by shift
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET
%endmacro
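; A hedged C sketch (illustration only) of scalarproduct_int16; note the asm
; shifts its partial sums before the last horizontal add, so low-order bits
; can differ from shifting one grand total:
;
;   static int scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                      int order, int shift)
;   {
;       int res = 0;    /* 32-bit accumulation, as with paddd */
;       for (int i = 0; i < order; i++)
;           res += v1[i] * v2[i];
;       return res >> shift;
;   }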

INIT_MMX
SCALARPRODUCT mmx2             ; mmsize == 8, mm registers
INIT_XMM
SCALARPRODUCT sse2             ; mmsize == 16, xmm registers

; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2           ; first tl is *left_top, the rest shift in from t
    movd    mm3, [leftq]
    psubb   mm0, mm4           ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1           ; tl: top bytes shifted, last t carried over
    movq    mm1, mm0           ; t
    psubb   mm0, mm4           ; t-tl
.skip:
    movq    mm2, [diffq+wq]
    ; the predictor is serial in l, so process the 8 bytes one at a time
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3           ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5           ; median of (l, t, t-tl+l)
    paddb   mm3, mm2           ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6           ; collect the 8 output bytes in mm7
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1] ; pass the final l and tl back to the caller
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
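; A hedged C reference (illustration, not from this file) of the median
; predictor the loop above vectorizes; all byte math wraps modulo 256:
;
;   #include <stdint.h>
;   static uint8_t mid(uint8_t a, uint8_t b, uint8_t c)
;   {   /* median of three, same min/max dance as the pmaxub/pminub block */
;       uint8_t mx = a > b ? a : b, mn = a < b ? a : b;
;       if (c < mx) mx = c;
;       return mx > mn ? mx : mn;
;   }
;   static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           uint8_t t = top[i];
;           l = mid(l, t, (uint8_t)(l + t - tl)) + diff[i];
;           tl = t;
;           dst[i] = l;
;       }
;       *left = l;
;       *left_top = tl;
;   }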

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8              ; prefix sum within the register:
    paddb   m1, m2             ; add neighbours at distance 1,
    mova    m2, m1
    pshufb  m1, m3             ; then at distance 2,
    paddb   m1, m2
    pshufb  m0, m5             ; broadcast the carry byte from the last block
    mova    m2, m1
    pshufb  m1, m4             ; then at distance 4,
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6             ; and at distance 8 for xmm registers
    paddb   m1, m2
%endif
    paddb   m0, m1             ; add the carried-in left value
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1             ; extract the last accumulated byte
    movd    eax, m0            ; and return it as the new left value
    RET
%endmacro
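; A hedged C sketch (illustration only) of the running left prediction that
; ADD_HFYU_LEFT_LOOP computes blockwise with the log2-step prefix sum above:
;
;   #include <stdint.h>
;   static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                           int w, int left)
;   {
;       uint8_t acc = left;
;       for (int i = 0; i < w; i++) {
;           acc += src[i];    /* running sum, wraps like paddb */
;           dst[i] = acc;
;       }
;       return acc;           /* the asm returns this via movd eax, m0 */
;   }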

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    psllq   m0, 56             ; initial left value in the top byte
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15             ; initial left value in the top byte
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue ; unaligned src: mm-register fallback
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
301 |