## ffmpeg / libavcodec / x86 / vp3dsp.asm @ b1c32fb5

History | View | Annotate | Download (20.7 KB)

1 |
;****************************************************************************** |
---|---|

2 |
;* MMX/SSE2-optimized functions for the VP3 decoder |

3 |
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> |

4 |
;* |

5 |
;* This file is part of FFmpeg. |

6 |
;* |

7 |
;* FFmpeg is free software; you can redistribute it and/or |

8 |
;* modify it under the terms of the GNU Lesser General Public |

9 |
;* License as published by the Free Software Foundation; either |

10 |
;* version 2.1 of the License, or (at your option) any later version. |

11 |
;* |

12 |
;* FFmpeg is distributed in the hope that it will be useful, |

13 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

14 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

15 |
;* Lesser General Public License for more details. |

16 |
;* |

17 |
;* You should have received a copy of the GNU Lesser General Public |

18 |
;* License along with FFmpeg; if not, write to the Free Software |

19 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

20 |
;****************************************************************************** |

21 | |

22 |
%include "x86inc.asm" |

23 |
%include "x86util.asm" |

24 | |

25 |
; MMX-optimized functions cribbed from the original VP3 source code. |

26 | |

27 |
SECTION_RODATA |

28 | |

29 |
; IDCT cosine table: row k-1 (addressed as C(k) by the IDCT macros) holds eight |
; copies of round(65536*cos(k*pi/16)) for k = 1..7. |
; Rows with values above 32767 (C(1)..C(5)) act as (c - 65536) in the signed |
; pmulhw, which is why their users add the multiplicand back afterwards. |
vp3_idct_data: times 8 dw 64277 |

30 |
times 8 dw 60547 |

31 |
times 8 dw 54491 |

32 |
times 8 dw 46341 |

33 |
times 8 dw 36410 |

34 |
times 8 dw 25080 |

35 |
times 8 dw 12785 |

36 | |

37 |
cextern pb_1 |

38 |
cextern pb_3 |

39 |
cextern pb_7 |

40 |
cextern pb_1F |

41 |
cextern pb_81 |

42 | |

43 |
cextern pw_8 |

44 | |

45 |
cextern put_signed_pixels_clamped_mmx |

46 |
cextern add_pixels_clamped_mmx |

47 | |

48 |
SECTION .text |

49 | |

50 |
; this is off by one or two for some cases when filter_limit is greater than 63 |

51 |
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 |

52 |
; out: p1 in mm4, p2 in mm3 |

53 |
; VP3_LOOP_FILTER: computes the filter delta d from p0..p3 on packed bytes, |
; bounds it with the limit read from [r2+516] and applies it to p1 and p2. |
; Clobbers m0-m7; the filtered p1 ends up in m4 and the filtered p2 in m3. |
%macro VP3_LOOP_FILTER 0 |

54 |
movq m7, m6 |

55 |
pand m6, [pb_7] ; p0&7 |

56 |
psrlw m7, 3 |

57 |
pand m7, [pb_1F] ; p0>>3 |

58 |
movq m3, m2 ; p2 |

59 |
pxor m2, m4 |

60 |
pand m2, [pb_1] ; (p2^p1)&1 |

61 |
movq m5, m2 |

62 |
paddb m2, m2 |

63 |
paddb m2, m5 ; 3*((p2^p1)&1) |

64 |
paddb m2, m6 ; extra bits lost in shifts |

65 |
pcmpeqb m0, m0 |

66 |
pxor m1, m0 ; 255 - p3 |

67 |
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1 |

68 |
pxor m0, m4 ; 255 - p1 |

69 |
pavgb m0, m3 ; (256 + p2-p1) >> 1 |

70 |
paddb m1, [pb_3] |

71 |
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2 |

72 |
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3 |

73 |
paddusb m7, m1 ; d+128+1 |

74 |
; split the biased delta into two unsigned magnitudes: m6 = bias - x, |
; m7 = x - bias (each saturating at 0); pb_81 is presumably the 128+1 bias |
movq m6, [pb_81] |

75 |
psubusb m6, m7 |

76 |
psubusb m7, [pb_81] |

77 | |

78 |
; bound each magnitude x: result = min(2*min(x, flim), flim) - min(x, flim) |
movq m5, [r2+516] ; flim |

79 |
pminub m6, m5 |

80 |
pminub m7, m5 |

81 |
movq m0, m6 |

82 |
movq m1, m7 |

83 |
paddb m6, m6 |

84 |
paddb m7, m7 |

85 |
pminub m6, m5 |

86 |
pminub m7, m5 |

87 |
psubb m6, m0 |

88 |
psubb m7, m1 |

89 |
; apply with unsigned saturation: p1 = p1 + m7 - m6, p2 = p2 - m7 + m6 |
paddusb m4, m7 |

90 |
psubusb m4, m6 |

91 |
psubusb m3, m7 |

92 |
paddusb m3, m6 |

93 |
%endmacro |

94 | |

95 |
; STORE_4_WORDS reg: stores the four 16-bit words of %1, lowest first, at |
; [r0-1], [r0+r1-1], [r0+r1*2-1] and [r0+r3-1]. |
; Clobbers r2 and leaves %1 shifted right by 32 bits. |
%macro STORE_4_WORDS 1 |

96 |
movd r2d, %1 |

97 |
mov [r0 -1], r2w |

98 |
psrlq %1, 32 |

99 |
shr r2, 16 |

100 |
mov [r0+r1 -1], r2w |

101 |
movd r2d, %1 |

102 |
mov [r0+r1*2-1], r2w |

103 |
shr r2, 16 |

104 |
mov [r0+r3 -1], r2w |

105 |
%endmacro |

106 | |

107 |
INIT_MMX |

108 |
; vp3_v_loop_filter_mmx2: filters across a horizontal edge, 8 bytes wide. |
; r0 = pointer to the row just below the edge, r1 = stride (32-bit, |
; sign-extended on x86-64), r2 = base of the table read at [r2+516]. |
; Rows r0-2*stride .. r0+stride are read; rows r0-stride and r0 are rewritten. |
cglobal vp3_v_loop_filter_mmx2, 3, 4 |

109 |
%ifdef ARCH_X86_64 |

110 |
movsxd r1, r1d |

111 |
%endif |

112 |
; from here on r3 = +stride and r1 = -stride |
mov r3, r1 |

113 |
neg r1 |

114 |
; load p0..p3 into the registers VP3_LOOP_FILTER expects |
movq m6, [r0+r1*2] |

115 |
movq m4, [r0+r1 ] |

116 |
movq m2, [r0 ] |

117 |
movq m1, [r0+r3 ] |

118 | |

119 |
VP3_LOOP_FILTER |

120 | |

121 |
; write back filtered p1 (m4) and p2 (m3) |
movq [r0+r1], m4 |

122 |
movq [r0 ], m3 |

123 |
RET |

124 | |

125 |
; vp3_h_loop_filter_mmx2: filters across a vertical edge, 8 rows tall. |
; r0 = pointer to the first pixel right of the edge in the top row, |
; r1 = stride (32-bit, sign-extended on x86-64), r2 = base of the table |
; read at [r2+516]. Four bytes starting at column -2 are read from each row; |
; the two bytes at columns -1 and 0 of each row are rewritten. |
cglobal vp3_h_loop_filter_mmx2, 3, 4 |

126 |
%ifdef ARCH_X86_64 |

127 |
movsxd r1, r1d |

128 |
%endif |

129 |
lea r3, [r1*3] |

130 | |

131 |
; rows 0-3: four bytes each into the low half of m6, m4, m2, m1 |
movd m6, [r0 -2] |

132 |
movd m4, [r0+r1 -2] |

133 |
movd m2, [r0+r1*2-2] |

134 |
movd m1, [r0+r3 -2] |

135 |
; rows 4-7: byte-interleave row n+4 with row n |
lea r0, [r0+r1*4 ] |

136 |
punpcklbw m6, [r0 -2] |

137 |
punpcklbw m4, [r0+r1 -2] |

138 |
punpcklbw m2, [r0+r1*2-2] |

139 |
punpcklbw m1, [r0+r3 -2] |

140 |
; undo the 4-row advance: r0 -= 3*stride, then r0 -= stride |
sub r0, r3 |

141 |
sub r0, r1 |

142 | |

143 |
; TRANSPOSE4x4B / SBUTTERFLY are defined outside this file (x86util.asm |
; is included above); presumably they turn the rows into p0..p3 vectors |
; and re-interleave filtered p1/p2 into per-row byte pairs -- TODO confirm |
TRANSPOSE4x4B 6, 4, 2, 1, 0 |

144 |
VP3_LOOP_FILTER |

145 |
SBUTTERFLY bw, 4, 3, 5 |

146 | |

147 |
; first four rows from m4, next four rows from m3 |
STORE_4_WORDS m4 |

148 |
lea r0, [r0+r1*4 ] |

149 |
STORE_4_WORDS m3 |

150 |
RET |

151 | |

152 |
; from original comments: The Macro does IDct on 4 1-D Dcts |

153 |
; BeginIDCT: common first part of the 1-D IDCT used by RowIDCT and ColumnIDCT. |
; Inputs are read through I()/J() and constants through C(). |
; On exit: m0 = C., m1 = H., m2 = R2, m4 = E, m5 = B.., m6 = F., m7 = G, |
; I(2) = D. (I(1) was used to spill C. and still holds it); m3 is scratch. |
%macro BeginIDCT 0 |

154 |
movq m2, I(3) |

155 |
movq m6, C(3) |

156 |
movq m4, m2 |

157 |
movq m7, J(5) |

158 |
pmulhw m4, m6 ; r4 = c3*i3 - i3 |

159 |
movq m1, C(5) |

160 |
pmulhw m6, m7 ; r6 = c3*i5 - i5 |

161 |
movq m5, m1 |

162 |
pmulhw m1, m2 ; r1 = c5*i3 - i3 |

163 |
movq m3, I(1) |

164 |
pmulhw m5, m7 ; r5 = c5*i5 - i5 |

165 |
movq m0, C(1) |

166 |
paddw m4, m2 ; r4 = c3*i3 |

167 |
paddw m6, m7 ; r6 = c3*i5 |

168 |
paddw m2, m1 ; r2 = c5*i3 |

169 |
movq m1, J(7) |

170 |
paddw m7, m5 ; r7 = c5*i5 |

171 |
movq m5, m0 ; r5 = c1 |

172 |
pmulhw m0, m3 ; r0 = c1*i1 - i1 |

173 |
paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5 |

174 |
pmulhw m5, m1 ; r5 = c1*i7 - i7 |

175 |
movq m7, C(7) |

176 |
psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3 |

177 |
paddw m0, m3 ; r0 = c1*i1 |

178 |
pmulhw m3, m7 ; r3 = c7*i1 |

179 |
movq m2, I(2) |

180 |
pmulhw m7, m1 ; r7 = c7*i7 |

181 |
paddw m5, m1 ; r5 = c1*i7 |

182 |
movq m1, m2 ; r1 = i2 |

183 |
pmulhw m2, C(2) ; r2 = c2*i2 - i2 |

184 |
psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7 |

185 |
movq m5, J(6) |

186 |
paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7 |

187 |
movq m7, m5 ; r7 = i6 |

188 |
psubsw m0, m4 ; r0 = A - C |

189 |
pmulhw m5, C(2) ; r5 = c2*i6 - i6 |

190 |
paddw m2, m1 ; r2 = c2*i2 |

191 |
pmulhw m1, C(6) ; r1 = c6*i2 |

192 |
paddsw m4, m4 ; r4 = C + C |

193 |
paddsw m4, m0 ; r4 = C. = A + C |

194 |
psubsw m3, m6 ; r3 = B - D |

195 |
paddw m5, m7 ; r5 = c2*i6 |

196 |
paddsw m6, m6 ; r6 = D + D |

197 |
pmulhw m7, C(6) ; r7 = c6*i6 |

198 |
paddsw m6, m3 ; r6 = D. = B + D |

199 |
movq I(1), m4 ; save C. at I(1) |

200 |
psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6 |

201 |
movq m4, C(4) |

202 |
movq m5, m3 ; r5 = B - D |

203 |
pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D) |

204 |
paddsw m7, m2 ; r7 = G = c6*i6 + c2*i2 |

205 |
movq I(2), m6 ; save D. at I(2) |

206 |
movq m2, m0 ; r2 = A - C |

207 |
movq m6, I(0) |

208 |
pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C) |

209 |
paddw m5, m3 ; r5 = B. = c4 * (B - D) |

210 |
movq m3, J(4) |

211 |
psubsw m5, m1 ; r5 = B.. = B. - H |

212 |
paddw m2, m0 ; r2 = A. = c4 * (A - C) |

213 |
psubsw m6, m3 ; r6 = i0 - i4 |

214 |
movq m0, m6 |

215 |
pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4) |

216 |
paddsw m3, m3 ; r3 = i4 + i4 |

217 |
paddsw m1, m1 ; r1 = H + H |

218 |
paddsw m3, m0 ; r3 = i0 + i4 |

219 |
paddsw m1, m5 ; r1 = H. = B. + H |

220 |
pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4) |

221 |
paddsw m6, m0 ; r6 = F = c4 * (i0 - i4) |

222 |
psubsw m6, m2 ; r6 = F. = F - A. |

223 |
paddsw m2, m2 ; r2 = A. + A. |

224 |
movq m0, I(1) ; r0 = C. |

225 |
paddsw m2, m6 ; r2 = A.. = F + A. |

226 |
paddw m4, m3 ; r4 = E = c4 * (i0 + i4) |

227 |
psubsw m2, m1 ; r2 = R2 = A.. - H. |

228 |
%endmacro |

229 | |

230 |
; RowIDCT gets ready to transpose |

231 |
; RowIDCT: finishes the butterflies after BeginIDCT without scaling. |
; On exit R0 is in m0, R1 is stored at I(1) and R2..R7 are in m2..m7, |
; which is the layout the Transpose macro expects at entry. |
%macro RowIDCT 0 |

232 |
BeginIDCT |

233 |
movq m3, I(2) ; r3 = D. |

234 |
psubsw m4, m7 ; r4 = E. = E - G |

235 |
paddsw m1, m1 ; r1 = H. + H. |

236 |
paddsw m7, m7 ; r7 = G + G |

237 |
paddsw m1, m2 ; r1 = R1 = A.. + H. |

238 |
paddsw m7, m4 ; r7 = G. = E + G |

239 |
psubsw m4, m3 ; r4 = R4 = E. - D. |

240 |
paddsw m3, m3 |

241 |
psubsw m6, m5 ; r6 = R6 = F. - B.. |

242 |
paddsw m5, m5 |

243 |
paddsw m3, m4 ; r3 = R3 = E. + D. |

244 |
paddsw m5, m6 ; r5 = R5 = F. + B.. |

245 |
psubsw m7, m0 ; r7 = R7 = G. - C. |

246 |
paddsw m0, m0 |

247 |
movq I(1), m1 ; save R1 |

248 |
paddsw m0, m7 ; r0 = R0 = G. + C. |

249 |
%endmacro |

250 | |

251 |
; Column IDCT normalizes and stores final results |

252 |
; ColumnIDCT: finishes the butterflies after BeginIDCT, adds OC_8 to each |
; result and shifts it right arithmetically by 4 (NRn = normalized Rn), then |
; stores NR0..NR3 to I(0)..I(3) and NR4..NR7 to J(4)..J(7). |
%macro ColumnIDCT 0 |

253 |
BeginIDCT |

254 |
paddsw m2, OC_8 ; adjust R2 (and R1) for shift |

255 |
paddsw m1, m1 ; r1 = H. + H. |

256 |
paddsw m1, m2 ; r1 = R1 = A.. + H. |

257 |
psraw m2, 4 ; r2 = NR2 |

258 |
psubsw m4, m7 ; r4 = E. = E - G |

259 |
psraw m1, 4 ; r1 = NR1 |

260 |
movq m3, I(2) ; r3 = D. |

261 |
paddsw m7, m7 ; r7 = G + G |

262 |
movq I(2), m2 ; store NR2 at I2 |

263 |
paddsw m7, m4 ; r7 = G. = E + G |

264 |
movq I(1), m1 ; store NR1 at I1 |

265 |
psubsw m4, m3 ; r4 = R4 = E. - D. |

266 |
paddsw m4, OC_8 ; adjust R4 (and R3) for shift |

267 |
paddsw m3, m3 ; r3 = D. + D. |

268 |
paddsw m3, m4 ; r3 = R3 = E. + D. |

269 |
psraw m4, 4 ; r4 = NR4 |

270 |
psubsw m6, m5 ; r6 = R6 = F. - B.. |

271 |
psraw m3, 4 ; r3 = NR3 |

272 |
paddsw m6, OC_8 ; adjust R6 (and R5) for shift |

273 |
paddsw m5, m5 ; r5 = B.. + B.. |

274 |
paddsw m5, m6 ; r5 = R5 = F. + B.. |

275 |
psraw m6, 4 ; r6 = NR6 |

276 |
movq J(4), m4 ; store NR4 at J4 |

277 |
psraw m5, 4 ; r5 = NR5 |

278 |
movq I(3), m3 ; store NR3 at I3 |

279 |
psubsw m7, m0 ; r7 = R7 = G. - C. |

280 |
paddsw m7, OC_8 ; adjust R7 (and R0) for shift |

281 |
paddsw m0, m0 ; r0 = C. + C. |

282 |
paddsw m0, m7 ; r0 = R0 = G. + C. |

283 |
psraw m7, 4 ; r7 = NR7 |

284 |
movq J(6), m6 ; store NR6 at J6 |

285 |
psraw m0, 4 ; r0 = NR0 |

286 |
movq J(5), m5 ; store NR5 at J5 |

287 |
movq J(7), m7 ; store NR7 at J7 |

288 |
movq I(0), m0 ; store NR0 at I0 |

289 |
%endmacro |

290 | |

291 |
; Following macro does two 4x4 transposes in place. |

292 |
; |

293 |
; At entry (we assume): |

294 |
; |

295 |
; r0 = a3 a2 a1 a0 |

296 |
; I(1) = b3 b2 b1 b0 |

297 |
; r2 = c3 c2 c1 c0 |

298 |
; r3 = d3 d2 d1 d0 |

299 |
; |

300 |
; r4 = e3 e2 e1 e0 |

301 |
; r5 = f3 f2 f1 f0 |

302 |
; r6 = g3 g2 g1 g0 |

303 |
; r7 = h3 h2 h1 h0 |

304 |
; |

305 |
; At exit, we have: |

306 |
; |

307 |
; I(0) = d0 c0 b0 a0 |

308 |
; I(1) = d1 c1 b1 a1 |

309 |
; I(2) = d2 c2 b2 a2 |

310 |
; I(3) = d3 c3 b3 a3 |

311 |
; |

312 |
; J(4) = h0 g0 f0 e0 |

313 |
; J(5) = h1 g1 f1 e1 |

314 |
; J(6) = h2 g2 f2 e2 |

315 |
; J(7) = h3 g3 f3 e3 |

316 |
; |

317 |
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. |

318 |
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. |

319 |
; |

320 |
; Since r1 is free at entry, we calculate the Js first. |

321 |
; Transpose: transposes two 4x4 blocks of 16-bit words, writing the results |
; to J(4)..J(7) and I(0)..I(3). I(0) is used as a spill slot for m0 and the |
; second input row is read from I(1). m3 and m7 are only read; m0-m2 and |
; m4-m6 are overwritten. |
%macro Transpose 0 |

322 |
movq m1, m4 ; r1 = e3 e2 e1 e0 |

323 |
punpcklwd m4, m5 ; r4 = f1 e1 f0 e0 |

324 |
movq I(0), m0 ; save a3 a2 a1 a0 |

325 |
punpckhwd m1, m5 ; r1 = f3 e3 f2 e2 |

326 |
movq m0, m6 ; r0 = g3 g2 g1 g0 |

327 |
punpcklwd m6, m7 ; r6 = h1 g1 h0 g0 |

328 |
movq m5, m4 ; r5 = f1 e1 f0 e0 |

329 |
punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4 |

330 |
punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5 |

331 |
movq m6, m1 ; r6 = f3 e3 f2 e2 |

332 |
movq J(4), m4 |

333 |
punpckhwd m0, m7 ; r0 = h3 g3 h2 g2 |

334 |
movq J(5), m5 |

335 |
punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7 |

336 |
movq m4, I(0) ; r4 = a3 a2 a1 a0 |

337 |
punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6 |

338 |
movq m5, I(1) ; r5 = b3 b2 b1 b0 |

339 |
movq m0, m4 ; r0 = a3 a2 a1 a0 |

340 |
movq J(7), m6 |

341 |
punpcklwd m0, m5 ; r0 = b1 a1 b0 a0 |

342 |
movq J(6), m1 |

343 |
punpckhwd m4, m5 ; r4 = b3 a3 b2 a2 |

344 |
movq m5, m2 ; r5 = c3 c2 c1 c0 |

345 |
punpcklwd m2, m3 ; r2 = d1 c1 d0 c0 |

346 |
movq m1, m0 ; r1 = b1 a1 b0 a0 |

347 |
punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0 |

348 |
punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1 |

349 |
movq m2, m4 ; r2 = b3 a3 b2 a2 |

350 |
movq I(0), m0 |

351 |
punpckhwd m5, m3 ; r5 = d3 c3 d2 c2 |

352 |
movq I(1), m1 |

353 |
punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3 |

354 |
punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2 |

355 |
movq I(3), m4 |

356 |
movq I(2), m2 |

357 |
%endmacro |

358 | |

359 |
; VP3_IDCT_mmx ptr: in-place 8x8 IDCT on the 16-bit block at %1 (rows are |
; 16 bytes apart), done as two RowIDCT+Transpose passes (rows 0-3, rows 4-7) |
; followed by two ColumnIDCT passes (byte offsets 0 and 8 within each row). |
; NOTE(review): the eax/ebx/ecx/edx register notes below do not match this |
; macro, which addresses the block through %1 and the constants through |
; vp3_idct_data; they look inherited from an older version -- confirm. |
%macro VP3_IDCT_mmx 1 |

360 |
; eax = quantized input |

361 |
; ebx = dequantizer matrix |

362 |
; ecx = IDCT constants |

363 |
; M(I) = ecx + MaskOffset(0) + I * 8 |

364 |
; C(I) = ecx + CosineOffset(32) + (I-1) * 8 |

365 |
; edx = output |

366 |
; r0..r7 = mm0..mm7 |

367 |
%define OC_8 [pw_8] |

368 |
%define C(x) [vp3_idct_data+16*(x-1)] |

369 | |

370 |
; at this point, function has completed dequantization + dezigzag + |

371 |
; partial transposition; now do the idct itself |

372 |
%define I(x) [%1+16* x ] |

373 |
%define J(x) [%1+16*(x-4)+8] |

374 |
RowIDCT |

375 |
Transpose |

376 | |

377 |
%define I(x) [%1+16* x +64] |

378 |
%define J(x) [%1+16*(x-4)+72] |

379 |
RowIDCT |

380 |
Transpose |

381 | |

382 |
%define I(x) [%1+16*x] |

383 |
%define J(x) [%1+16*x] |

384 |
ColumnIDCT |

385 | |

386 |
%define I(x) [%1+16*x+8] |

387 |
%define J(x) [%1+16*x+8] |

388 |
ColumnIDCT |

389 |
%endmacro |

390 | |

391 |
; VP3_1D_IDCT_SSE2: one 1-D IDCT pass over eight 16-bit lanes. |
; Uses I(), C(), ADD() and SHIFT() as defined by the enclosing macro; I(1) |
; and I(2) are overwritten as spill slots. Results op0..op7 are left in |
; m0..m7. |
%macro VP3_1D_IDCT_SSE2 0 |

392 |
movdqa m2, I(3) ; xmm2 = i3 |

393 |
movdqa m6, C(3) ; xmm6 = c3 |

394 |
movdqa m4, m2 ; xmm4 = i3 |

395 |
movdqa m7, I(5) ; xmm7 = i5 |

396 |
pmulhw m4, m6 ; xmm4 = c3 * i3 - i3 |

397 |
movdqa m1, C(5) ; xmm1 = c5 |

398 |
pmulhw m6, m7 ; xmm6 = c3 * i5 - i5 |

399 |
movdqa m5, m1 ; xmm5 = c5 |

400 |
pmulhw m1, m2 ; xmm1 = c5 * i3 - i3 |

401 |
movdqa m3, I(1) ; xmm3 = i1 |

402 |
pmulhw m5, m7 ; xmm5 = c5 * i5 - i5 |

403 |
movdqa m0, C(1) ; xmm0 = c1 |

404 |
paddw m4, m2 ; xmm4 = c3 * i3 |

405 |
paddw m6, m7 ; xmm6 = c3 * i5 |

406 |
paddw m2, m1 ; xmm2 = c5 * i3 |

407 |
movdqa m1, I(7) ; xmm1 = i7 |

408 |
paddw m7, m5 ; xmm7 = c5 * i5 |

409 |
movdqa m5, m0 ; xmm5 = c1 |

410 |
pmulhw m0, m3 ; xmm0 = c1 * i1 - i1 |

411 |
paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C |

412 |
pmulhw m5, m1 ; xmm5 = c1 * i7 - i7 |

413 |
movdqa m7, C(7) ; xmm7 = c7 |

414 |
psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D |

415 |
paddw m0, m3 ; xmm0 = c1 * i1 |

416 |
pmulhw m3, m7 ; xmm3 = c7 * i1 |

417 |
movdqa m2, I(2) ; xmm2 = i2 |

418 |
pmulhw m7, m1 ; xmm7 = c7 * i7 |

419 |
paddw m5, m1 ; xmm5 = c1 * i7 |

420 |
movdqa m1, m2 ; xmm1 = i2 |

421 |
pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2 |

422 |
psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B |

423 |
movdqa m5, I(6) ; xmm5 = i6 |

424 |
paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A |

425 |
movdqa m7, m5 ; xmm7 = i6 |

426 |
psubsw m0, m4 ; xmm0 = A - C |

427 |
pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6 |

428 |
paddw m2, m1 ; xmm2 = i2 * c2 |

429 |
pmulhw m1, C(6) ; xmm1 = c6 * i2 |

430 |
paddsw m4, m4 ; xmm4 = C + C |

431 |
paddsw m4, m0 ; xmm4 = A + C = C. |

432 |
psubsw m3, m6 ; xmm3 = B - D |

433 |
paddw m5, m7 ; xmm5 = c2 * i6 |

434 |
paddsw m6, m6 ; xmm6 = D + D |

435 |
pmulhw m7, C(6) ; xmm7 = c6 * i6 |

436 |
paddsw m6, m3 ; xmm6 = B + D = D. |

437 |
movdqa I(1), m4 ; Save C. at I(1) |

438 |
psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H |

439 |
movdqa m4, C(4) ; xmm4 = C4 |

440 |
movdqa m5, m3 ; xmm5 = B - D |

441 |
pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D ) |

442 |
paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G |

443 |
movdqa I(2), m6 ; save D. at I(2) |

444 |
movdqa m2, m0 ; xmm2 = A - C |

445 |
movdqa m6, I(0) ; xmm6 = i0 |

446 |
pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) |

447 |
paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B. |

448 |
movdqa m3, I(4) ; xmm3 = i4 |

449 |
psubsw m5, m1 ; xmm5 = B. - H = B.. |

450 |
paddw m2, m0 ; xmm2 = c4 * ( A - C) = A. |

451 |
psubsw m6, m3 ; xmm6 = i0 - i4 |

452 |
movdqa m0, m6 ; xmm0 = i0 - i4 |

453 |
pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) |

454 |
paddsw m3, m3 ; xmm3 = i4 + i4 |

455 |
paddsw m1, m1 ; xmm1 = H + H |

456 |
paddsw m3, m0 ; xmm3 = i0 + i4 |

457 |
paddsw m1, m5 ; xmm1 = B. + H = H. |

458 |
pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 ) |

459 |
paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) = F |

460 |
psubsw m6, m2 ; xmm6 = F - A. = F. |

461 |
paddsw m2, m2 ; xmm2 = A. + A. |

462 |
movdqa m0, I(1) ; Load C. from I(1) |

463 |
paddsw m2, m6 ; xmm2 = F + A. = A.. |

464 |
paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E |

465 |
psubsw m2, m1 ; xmm2 = A.. - H. = R2 |

466 |
ADD(m2) ; Adjust R2 and R1 before shifting |

467 |
paddsw m1, m1 ; xmm1 = H. + H. |

468 |
paddsw m1, m2 ; xmm1 = A.. + H. = R1 |

469 |
SHIFT(m2) ; xmm2 = op2 |

470 |
psubsw m4, m7 ; xmm4 = E - G = E. |

471 |
SHIFT(m1) ; xmm1 = op1 |

472 |
movdqa m3, I(2) ; Load D. from I(2) |

473 |
paddsw m7, m7 ; xmm7 = G + G |

474 |
paddsw m7, m4 ; xmm7 = E + G = G. |

475 |
psubsw m4, m3 ; xmm4 = E. - D. = R4 |

476 |
ADD(m4) ; Adjust R4 and R3 before shifting |

477 |
paddsw m3, m3 ; xmm3 = D. + D. |

478 |
paddsw m3, m4 ; xmm3 = E. + D. = R3 |

479 |
SHIFT(m4) ; xmm4 = op4 |

480 |
psubsw m6, m5 ; xmm6 = F. - B..= R6 |

481 |
SHIFT(m3) ; xmm3 = op3 |

482 |
ADD(m6) ; Adjust R6 and R5 before shifting |

483 |
paddsw m5, m5 ; xmm5 = B.. + B.. |

484 |
paddsw m5, m6 ; xmm5 = F. + B.. = R5 |

485 |
SHIFT(m6) ; xmm6 = op6 |

486 |
SHIFT(m5) ; xmm5 = op5 |

487 |
psubsw m7, m0 ; xmm7 = G. - C. = R7 |

488 |
ADD(m7) ; Adjust R7 and R0 before shifting |

489 |
paddsw m0, m0 ; xmm0 = C. + C. |

490 |
paddsw m0, m7 ; xmm0 = G. + C. = R0 |

491 |
SHIFT(m7) ; xmm7 = op7 |

492 |
SHIFT(m0) ; xmm0 = op0 |

493 |
%endmacro |

494 | |

495 |
; PUT_BLOCK a..h: stores registers m%1..m%8 to O(0)..O(7) with aligned |
; 16-byte stores (movdqa), so the O() addresses must be 16-byte aligned. |
%macro PUT_BLOCK 8 |

496 |
movdqa O(0), m%1 |

497 |
movdqa O(1), m%2 |

498 |
movdqa O(2), m%3 |

499 |
movdqa O(3), m%4 |

500 |
movdqa O(4), m%5 |

501 |
movdqa O(5), m%6 |

502 |
movdqa O(6), m%7 |

503 |
movdqa O(7), m%8 |

504 |
%endmacro |

505 | |

506 |
; VP3_IDCT_sse2 ptr: in-place 8x8 IDCT on the 16-bit block at %1. |
; Pass 1 runs the 1-D IDCT with ADD/SHIFT defined empty, transposes the |
; eight result registers and stores them back; pass 2 reruns the 1-D IDCT |
; with ADD = +[pw_8] (saturating) and SHIFT = arithmetic >> 4 and stores. |
; TRANSPOSE8x8W comes from outside this file: on x86-64 it is given m8 as |
; an extra register, otherwise [%1] and [%1+16] (presumably as scratch). |
%macro VP3_IDCT_sse2 1 |

507 |
%define I(x) [%1+16*x] |

508 |
%define O(x) [%1+16*x] |

509 |
%define C(x) [vp3_idct_data+16*(x-1)] |

510 |
%define SHIFT(x) |

511 |
%define ADD(x) |

512 |
VP3_1D_IDCT_SSE2 |

513 |
%ifdef ARCH_X86_64 |

514 |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |

515 |
%else |

516 |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] |

517 |
%endif |

518 |
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 |

519 | |

520 |
%define SHIFT(x) psraw x, 4 |

521 |
%define ADD(x) paddsw x, [pw_8] |

522 |
VP3_1D_IDCT_SSE2 |

523 |
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 |

524 |
%endmacro |

525 | |

526 |
; vp3_idct_funcs suffix, nxmm, ngpr: emits vp3_idct_<suffix>, |
; vp3_idct_put_<suffix> and vp3_idct_add_<suffix>. |
; %1 = suffix selecting VP3_IDCT_%1, %2 = third cglobal argument (vector |
; register count), %3 = GPR count for the put/add variants. |
; put/add run the IDCT on the block passed as the third argument (r2), |
; rotate the arguments from (a0, a1, a2) to (a2, a0, a1) and hand over to |
; put_signed_pixels_clamped_mmx / add_pixels_clamped_mmx. On x86-32 the |
; rotated values are written back to the argument stack slots (r0m..r2m). |
; WIN64 uses call + RET instead of a tail jump, presumably so the cglobal |
; epilogue still runs -- confirm against x86inc. |
%macro vp3_idct_funcs 3 |

527 |
cglobal vp3_idct_%1, 1, 1, %2 |

528 |
VP3_IDCT_%1 r0 |

529 |
RET |

530 | |

531 |
cglobal vp3_idct_put_%1, 3, %3, %2 |

532 |
VP3_IDCT_%1 r2 |

533 |
%ifdef ARCH_X86_64 |

534 |
mov r3, r2 |

535 |
mov r2, r1 |

536 |
mov r1, r0 |

537 |
mov r0, r3 |

538 |
%else |

539 |
mov r0m, r2 |

540 |
mov r1m, r0 |

541 |
mov r2m, r1 |

542 |
%endif |

543 |
%ifdef WIN64 |

544 |
call put_signed_pixels_clamped_mmx |

545 |
RET |

546 |
%else |

547 |
jmp put_signed_pixels_clamped_mmx |

548 |
%endif |

549 | |

550 |
cglobal vp3_idct_add_%1, 3, %3, %2 |

551 |
VP3_IDCT_%1 r2 |

552 |
%ifdef ARCH_X86_64 |

553 |
mov r3, r2 |

554 |
mov r2, r1 |

555 |
mov r1, r0 |

556 |
mov r0, r3 |

557 |
%else |

558 |
mov r0m, r2 |

559 |
mov r1m, r0 |

560 |
mov r2m, r1 |

561 |
%endif |

562 |
%ifdef WIN64 |

563 |
call add_pixels_clamped_mmx |

564 |
RET |

565 |
%else |

566 |
jmp add_pixels_clamped_mmx |

567 |
%endif |

568 |
%endmacro |

569 | |

570 |
; Instantiate the MMX and SSE2 IDCT entry points. |
; REGS is the GPR count handed to vp3_idct_funcs: the x86-64 argument |
; rotation uses r3 as a temporary, hence 4 there and 3 otherwise. |
; The sse2 variant requests 9 vector registers because its x86-64 path |
; passes m8 to TRANSPOSE8x8W. |
%ifdef ARCH_X86_64 |

571 |
%define REGS 4 |

572 |
%else |

573 |
%define REGS 3 |

574 |
%endif |

575 |
INIT_MMX |

576 |
vp3_idct_funcs mmx, 0, REGS |

577 |
INIT_XMM |

578 |
vp3_idct_funcs sse2, 9, REGS |

579 |
%undef REGS |

580 | |

581 |
; DC_ADD: for the four 8-byte rows at r0, r0+r1, r0+r1*2 and r0+r3, adds m0 |
; and then subtracts m1, both with unsigned byte saturation, and stores the |
; rows back. Expects r3 = 3*r1; clobbers m2-m5. |
%macro DC_ADD 0 |

582 |
movq m2, [r0 ] |

583 |
movq m3, [r0+r1 ] |

584 |
paddusb m2, m0 |

585 |
movq m4, [r0+r1*2] |

586 |
paddusb m3, m0 |

587 |
movq m5, [r0+r3 ] |

588 |
paddusb m4, m0 |

589 |
paddusb m5, m0 |

590 |
psubusb m2, m1 |

591 |
psubusb m3, m1 |

592 |
movq [r0 ], m2 |

593 |
psubusb m4, m1 |

594 |
movq [r0+r1 ], m3 |

595 |
psubusb m5, m1 |

596 |
movq [r0+r1*2], m4 |

597 |
movq [r0+r3 ], m5 |

598 |
%endmacro |

599 | |

600 |
INIT_MMX |

601 |
; vp3_idct_dc_add_mmx2: adds a DC-only correction to an 8x8 byte block. |
; r0 = destination, r1 = stride (32-bit, sign-extended on x86-64), |
; r2 = pointer to the signed 16-bit DC coefficient. |
; dc = (coeff + 15) >> 5 (arithmetic shift); it is broadcast to four words |
; and split into m0 = bytes of dc and m1 = bytes of -dc, each clamped to |
; 0..255 by packuswb, so DC_ADD can apply either sign with saturating ops. |
cglobal vp3_idct_dc_add_mmx2, 3, 4 |

602 |
%ifdef ARCH_X86_64 |

603 |
movsxd r1, r1d |

604 |
%endif |

605 |
lea r3, [r1*3] |

606 |
movsx r2, word [r2] |

607 |
add r2, 15 |

608 |
sar r2, 5 |

609 |
movd m0, r2d |

610 |
pshufw m0, m0, 0x0 |

611 |
pxor m1, m1 |

612 |
psubw m1, m0 |

613 |
packuswb m0, m0 |

614 |
packuswb m1, m1 |

615 |
; rows 0-3, then advance four rows and do rows 4-7 |
DC_ADD |

616 |
lea r0, [r0+r1*4] |

617 |
DC_ADD |

618 |
RET |