## ffmpeg / libavcodec / x86 / h264_intrapred.asm @ 80944df7

History | View | Annotate | Download (70.5 KB)

1 | 4af8cdfc | Jason Garrett-Glaser | ;****************************************************************************** |
---|---|---|---|

2 | ;* H.264 intra prediction asm optimizations |
||

3 | ;* Copyright (c) 2010 Jason Garrett-Glaser |
||

4 | 83ff3f72 | Ronald S. Bultje | ;* Copyright (c) 2010 Holger Lubitz |

5 | ;* Copyright (c) 2010 Loren Merritt |
||

6 | ;* Copyright (c) 2010 Ronald S. Bultje |
||

7 | 4af8cdfc | Jason Garrett-Glaser | ;* |

8 | ;* This file is part of FFmpeg. |
||

9 | ;* |
||

10 | ;* FFmpeg is free software; you can redistribute it and/or |
||

11 | ;* modify it under the terms of the GNU Lesser General Public |
||

12 | ;* License as published by the Free Software Foundation; either |
||

13 | ;* version 2.1 of the License, or (at your option) any later version. |
||

14 | ;* |
||

15 | ;* FFmpeg is distributed in the hope that it will be useful, |
||

16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||

17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||

18 | ;* Lesser General Public License for more details. |
||

19 | ;* |
||

20 | ;* You should have received a copy of the GNU Lesser General Public |
||

21 | ;* License along with FFmpeg; if not, write to the Free Software |
||

22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||

23 | ;****************************************************************************** |
||

24 | |||

25 | %include "x86inc.asm" |
||

26 | 2e93fd4b | Daniel Kang | %include "x86util.asm" |

27 | 4af8cdfc | Jason Garrett-Glaser | |

28 | SECTION_RODATA |
||

29 | |||

30 | tm_shuf: times 8 db 0x03, 0x80 |
||

31 | bdd93f1b | Daniel Kang | pw_ff00: times 8 dw 0xff00 |

32 | dd68d4db | Ronald S. Bultje | plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 |

33 | db 1, 2, 3, 4, 5, 6, 7, 8 |
||

34 | plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 |
||

35 | db 1, 2, 3, 4, 0, 0, 0, 0 |
||

36 | pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 |
||

37 | pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 |
||

38 | pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 |
||

39 | pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 |
||

40 | 4af8cdfc | Jason Garrett-Glaser | |

41 | SECTION .text |
||

42 | |||

43 | bc14f04b | Jason Garrett-Glaser | cextern pb_1 |

44 | 4af8cdfc | Jason Garrett-Glaser | cextern pb_3 |

45 | 2e93fd4b | Daniel Kang | cextern pw_4 |

46 | dd68d4db | Ronald S. Bultje | cextern pw_5 |

47 | abab14ea | Daniel Kang | cextern pw_8 |

48 | dd68d4db | Ronald S. Bultje | cextern pw_16 |

49 | cextern pw_17 |
||

50 | cextern pw_32 |
||

51 | 4af8cdfc | Jason Garrett-Glaser | |

52 | ;----------------------------------------------------------------------------- |
||

53 | ; void pred16x16_vertical(uint8_t *src, int stride) |
||

54 | ;----------------------------------------------------------------------------- |
||

55 | |||

56 | cglobal pred16x16_vertical_mmx, 2,3 |
||

57 | sub r0, r1 |
||

58 | mov r2, 8 |
||

59 | movq mm0, [r0+0] |
||

60 | movq mm1, [r0+8] |
||

61 | .loop: |
||

62 | movq [r0+r1*1+0], mm0 |
||

63 | movq [r0+r1*1+8], mm1 |
||

64 | movq [r0+r1*2+0], mm0 |
||

65 | movq [r0+r1*2+8], mm1 |
||

66 | lea r0, [r0+r1*2] |
||

67 | dec r2 |
||

68 | jg .loop |
||

69 | REP_RET |
||

70 | |||

71 | cglobal pred16x16_vertical_sse, 2,3 |
||

72 | sub r0, r1 |
||

73 | mov r2, 4 |
||

74 | movaps xmm0, [r0] |
||

75 | .loop: |
||

76 | movaps [r0+r1*1], xmm0 |
||

77 | movaps [r0+r1*2], xmm0 |
||

78 | lea r0, [r0+r1*2] |
||

79 | movaps [r0+r1*1], xmm0 |
||

80 | movaps [r0+r1*2], xmm0 |
||

81 | lea r0, [r0+r1*2] |
||

82 | dec r2 |
||

83 | jg .loop |
||

84 | REP_RET |
||

85 | |||

86 | ;----------------------------------------------------------------------------- |
||

87 | ; void pred16x16_horizontal(uint8_t *src, int stride) |
||

88 | ;----------------------------------------------------------------------------- |
||

89 | |||

90 | %macro PRED16x16_H 1 |
||

91 | cglobal pred16x16_horizontal_%1, 2,3 |
||

92 | mov r2, 8 |
||

93 | %ifidn %1, ssse3 |
||

94 | mova m2, [pb_3] |
||

95 | %endif |
||

96 | .loop: |
||

97 | movd m0, [r0+r1*0-4] |
||

98 | movd m1, [r0+r1*1-4] |
||

99 | |||

100 | %ifidn %1, ssse3 |
||

101 | pshufb m0, m2 |
||

102 | pshufb m1, m2 |
||

103 | %else |
||

104 | punpcklbw m0, m0 |
||

105 | punpcklbw m1, m1 |
||

106 | %ifidn %1, mmxext |
||

107 | pshufw m0, m0, 0xff |
||

108 | pshufw m1, m1, 0xff |
||

109 | %else |
||

110 | punpckhwd m0, m0 |
||

111 | punpckhwd m1, m1 |
||

112 | punpckhdq m0, m0 |
||

113 | punpckhdq m1, m1 |
||

114 | %endif |
||

115 | mova [r0+r1*0+8], m0 |
||

116 | mova [r0+r1*1+8], m1 |
||

117 | %endif |
||

118 | |||

119 | mova [r0+r1*0], m0 |
||

120 | mova [r0+r1*1], m1 |
||

121 | lea r0, [r0+r1*2] |
||

122 | dec r2 |
||

123 | jg .loop |
||

124 | REP_RET |
||

125 | %endmacro |
||

126 | |||

127 | INIT_MMX |
||

128 | PRED16x16_H mmx |
||

129 | PRED16x16_H mmxext |
||

130 | INIT_XMM |
||

131 | PRED16x16_H ssse3 |
||

132 | |||

133 | ;----------------------------------------------------------------------------- |
||

134 | ; void pred16x16_dc(uint8_t *src, int stride) |
||

135 | ;----------------------------------------------------------------------------- |
||

136 | |||

137 | 17dc7c7a | Jason Garrett-Glaser | %macro PRED16x16_DC 1 |

138 | 4af8cdfc | Jason Garrett-Glaser | cglobal pred16x16_dc_%1, 2,7 |

139 | mov r4, r0 |
||

140 | sub r0, r1 |
||

141 | pxor mm0, mm0 |
||

142 | pxor mm1, mm1 |
||

143 | psadbw mm0, [r0+0] |
||

144 | psadbw mm1, [r0+8] |
||

145 | dec r0 |
||

146 | movzx r5d, byte [r0+r1*1] |
||

147 | paddw mm0, mm1 |
||

148 | movd r6d, mm0 |
||

149 | lea r0, [r0+r1*2] |
||

150 | %rep 7 |
||

151 | movzx r2d, byte [r0+r1*0] |
||

152 | movzx r3d, byte [r0+r1*1] |
||

153 | add r5d, r2d |
||

154 | add r6d, r3d |
||

155 | lea r0, [r0+r1*2] |
||

156 | %endrep |
||

157 | movzx r2d, byte [r0+r1*0] |
||

158 | add r5d, r6d |
||

159 | lea r2d, [r2+r5+16] |
||

160 | shr r2d, 5 |
||

161 | 270a85d2 | Jason Garrett-Glaser | %ifidn %1, mmxext |

162 | 4af8cdfc | Jason Garrett-Glaser | movd m0, r2d |

163 | punpcklbw m0, m0 |
||

164 | pshufw m0, m0, 0 |
||

165 | %elifidn %1, sse2 |
||

166 | movd m0, r2d |
||

167 | punpcklbw m0, m0 |
||

168 | pshuflw m0, m0, 0 |
||

169 | punpcklqdq m0, m0 |
||

170 | %elifidn %1, ssse3 |
||

171 | pxor m1, m1 |
||

172 | movd m0, r2d |
||

173 | pshufb m0, m1 |
||

174 | %endif |
||

175 | |||

176 | %if mmsize==8 |
||

177 | mov r3d, 8 |
||

178 | .loop: |
||

179 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0+0], m0 |

180 | mova [r4+r1*0+8], m0 |
||

181 | mova [r4+r1*1+0], m0 |
||

182 | mova [r4+r1*1+8], m0 |
||

183 | 4af8cdfc | Jason Garrett-Glaser | %else |

184 | mov r3d, 4 |
||

185 | .loop: |
||

186 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0], m0 |

187 | mova [r4+r1*1], m0 |
||

188 | 4af8cdfc | Jason Garrett-Glaser | lea r4, [r4+r1*2] |

189 | 17dc7c7a | Jason Garrett-Glaser | mova [r4+r1*0], m0 |

190 | mova [r4+r1*1], m0 |
||

191 | 4af8cdfc | Jason Garrett-Glaser | %endif |

192 | lea r4, [r4+r1*2] |
||

193 | dec r3d |
||

194 | jg .loop |
||

195 | REP_RET |
||

196 | %endmacro |
||

197 | |||

198 | INIT_MMX |
||

199 | 17dc7c7a | Jason Garrett-Glaser | PRED16x16_DC mmxext |

200 | 4af8cdfc | Jason Garrett-Glaser | INIT_XMM |

201 | 17dc7c7a | Jason Garrett-Glaser | PRED16x16_DC sse2 |

202 | PRED16x16_DC ssse3 |
||

203 | 4af8cdfc | Jason Garrett-Glaser | |

204 | ;----------------------------------------------------------------------------- |
||

205 | ; void pred16x16_tm_vp8(uint8_t *src, int stride) |
||

206 | ;----------------------------------------------------------------------------- |
||

207 | |||

208 | %macro PRED16x16_TM_MMX 1 |
||

209 | cglobal pred16x16_tm_vp8_%1, 2,5 |
||

210 | sub r0, r1 |
||

211 | pxor mm7, mm7 |
||

212 | movq mm0, [r0+0] |
||

213 | movq mm2, [r0+8] |
||

214 | movq mm1, mm0 |
||

215 | movq mm3, mm2 |
||

216 | punpcklbw mm0, mm7 |
||

217 | punpckhbw mm1, mm7 |
||

218 | punpcklbw mm2, mm7 |
||

219 | punpckhbw mm3, mm7 |
||

220 | movzx r3d, byte [r0-1] |
||

221 | mov r4d, 16 |
||

222 | .loop: |
||

223 | movzx r2d, byte [r0+r1-1] |
||

224 | sub r2d, r3d |
||

225 | movd mm4, r2d |
||

226 | %ifidn %1, mmx |
||

227 | punpcklwd mm4, mm4 |
||

228 | punpckldq mm4, mm4 |
||

229 | %else |
||

230 | pshufw mm4, mm4, 0 |
||

231 | %endif |
||

232 | movq mm5, mm4 |
||

233 | movq mm6, mm4 |
||

234 | movq mm7, mm4 |
||

235 | paddw mm4, mm0 |
||

236 | paddw mm5, mm1 |
||

237 | paddw mm6, mm2 |
||

238 | paddw mm7, mm3 |
||

239 | packuswb mm4, mm5 |
||

240 | packuswb mm6, mm7 |
||

241 | movq [r0+r1+0], mm4 |
||

242 | movq [r0+r1+8], mm6 |
||

243 | add r0, r1 |
||

244 | dec r4d |
||

245 | jg .loop |
||

246 | REP_RET |
||

247 | %endmacro |
||

248 | |||

249 | PRED16x16_TM_MMX mmx |
||

250 | PRED16x16_TM_MMX mmxext |
||

251 | |||

252 | cglobal pred16x16_tm_vp8_sse2, 2,6,6 |
||

253 | sub r0, r1 |
||

254 | pxor xmm2, xmm2 |
||

255 | movdqa xmm0, [r0] |
||

256 | movdqa xmm1, xmm0 |
||

257 | punpcklbw xmm0, xmm2 |
||

258 | punpckhbw xmm1, xmm2 |
||

259 | movzx r4d, byte [r0-1] |
||

260 | mov r5d, 8 |
||

261 | .loop: |
||

262 | movzx r2d, byte [r0+r1*1-1] |
||

263 | movzx r3d, byte [r0+r1*2-1] |
||

264 | sub r2d, r4d |
||

265 | sub r3d, r4d |
||

266 | movd xmm2, r2d |
||

267 | movd xmm4, r3d |
||

268 | pshuflw xmm2, xmm2, 0 |
||

269 | pshuflw xmm4, xmm4, 0 |
||

270 | punpcklqdq xmm2, xmm2 |
||

271 | punpcklqdq xmm4, xmm4 |
||

272 | movdqa xmm3, xmm2 |
||

273 | movdqa xmm5, xmm4 |
||

274 | paddw xmm2, xmm0 |
||

275 | paddw xmm3, xmm1 |
||

276 | paddw xmm4, xmm0 |
||

277 | paddw xmm5, xmm1 |
||

278 | packuswb xmm2, xmm3 |
||

279 | packuswb xmm4, xmm5 |
||

280 | movdqa [r0+r1*1], xmm2 |
||

281 | movdqa [r0+r1*2], xmm4 |
||

282 | lea r0, [r0+r1*2] |
||

283 | dec r5d |
||

284 | jg .loop |
||

285 | REP_RET |
||

286 | |||

287 | ;----------------------------------------------------------------------------- |
||

288 | dd68d4db | Ronald S. Bultje | ; void pred16x16_plane(uint8_t *src, int stride) |

289 | ;----------------------------------------------------------------------------- |
||

290 | |||

291 | %macro H264_PRED16x16_PLANE 3 |
||

292 | cglobal pred16x16_plane_%3_%1, 2, 7, %2 |
||

293 | mov r2, r1 ; +stride |
||

294 | neg r1 ; -stride |
||

295 | |||

296 | movh m0, [r0+r1 -1] |
||

297 | %if mmsize == 8 |
||

298 | pxor m4, m4 |
||

299 | movh m1, [r0+r1 +3 ] |
||

300 | movh m2, [r0+r1 +8 ] |
||

301 | movh m3, [r0+r1 +12] |
||

302 | punpcklbw m0, m4 |
||

303 | punpcklbw m1, m4 |
||

304 | punpcklbw m2, m4 |
||

305 | punpcklbw m3, m4 |
||

306 | pmullw m0, [pw_m8tom1 ] |
||

307 | pmullw m1, [pw_m8tom1+8] |
||

308 | pmullw m2, [pw_1to8 ] |
||

309 | pmullw m3, [pw_1to8 +8] |
||

310 | paddw m0, m2 |
||

311 | paddw m1, m3 |
||

312 | %else ; mmsize == 16 |
||

313 | %ifidn %1, sse2 |
||

314 | pxor m2, m2 |
||

315 | movh m1, [r0+r1 +8] |
||

316 | punpcklbw m0, m2 |
||

317 | punpcklbw m1, m2 |
||

318 | pmullw m0, [pw_m8tom1] |
||

319 | pmullw m1, [pw_1to8] |
||

320 | paddw m0, m1 |
||

321 | %else ; ssse3 |
||

322 | movhps m0, [r0+r1 +8] |
||

323 | pmaddubsw m0, [plane_shuf] ; H coefficients |
||

324 | %endif |
||

325 | movhlps m1, m0 |
||

326 | %endif |
||

327 | paddw m0, m1 |
||

328 | %ifidn %1, mmx |
||

329 | mova m1, m0 |
||

330 | psrlq m1, 32 |
||

331 | %elifidn %1, mmx2 |
||

332 | pshufw m1, m0, 0xE |
||

333 | %else ; mmsize == 16 |
||

334 | pshuflw m1, m0, 0xE |
||

335 | %endif |
||

336 | paddw m0, m1 |
||

337 | %ifidn %1, mmx |
||

338 | mova m1, m0 |
||

339 | psrlq m1, 16 |
||

340 | %elifidn %1, mmx2 |
||

341 | pshufw m1, m0, 0x1 |
||

342 | %else |
||

343 | pshuflw m1, m0, 0x1 |
||

344 | %endif |
||

345 | paddw m0, m1 ; sum of H coefficients |
||

346 | |||

347 | lea r4, [r0+r2*8-1] |
||

348 | lea r3, [r0+r2*4-1] |
||

349 | add r4, r2 |
||

350 | |||

351 | %ifdef ARCH_X86_64 |
||

352 | %define e_reg r11 |
||

353 | %else |
||

354 | %define e_reg r0 |
||

355 | %endif |
||

356 | |||

357 | movzx e_reg, byte [r3+r2*2 ] |
||

358 | movzx r5, byte [r4+r1 ] |
||

359 | sub r5, e_reg |
||

360 | |||

361 | movzx e_reg, byte [r3+r2 ] |
||

362 | movzx r6, byte [r4 ] |
||

363 | sub r6, e_reg |
||

364 | lea r5, [r5+r6*2] |
||

365 | |||

366 | movzx e_reg, byte [r3+r1 ] |
||

367 | movzx r6, byte [r4+r2*2 ] |
||

368 | sub r6, e_reg |
||

369 | lea r5, [r5+r6*4] |
||

370 | |||

371 | movzx e_reg, byte [r3 ] |
||

372 | %ifdef ARCH_X86_64 |
||

373 | movzx r10, byte [r4+r2 ] |
||

374 | sub r10, e_reg |
||

375 | %else |
||

376 | movzx r6, byte [r4+r2 ] |
||

377 | sub r6, e_reg |
||

378 | lea r5, [r5+r6*4] |
||

379 | sub r5, r6 |
||

380 | %endif |
||

381 | |||

382 | lea e_reg, [r3+r1*4] |
||

383 | lea r3, [r4+r2*4] |
||

384 | |||

385 | movzx r4, byte [e_reg+r2 ] |
||

386 | movzx r6, byte [r3 ] |
||

387 | sub r6, r4 |
||

388 | %ifdef ARCH_X86_64 |
||

389 | lea r6, [r10+r6*2] |
||

390 | lea r5, [r5+r6*2] |
||

391 | add r5, r6 |
||

392 | %else |
||

393 | lea r5, [r5+r6*4] |
||

394 | lea r5, [r5+r6*2] |
||

395 | %endif |
||

396 | |||

397 | movzx r4, byte [e_reg ] |
||

398 | %ifdef ARCH_X86_64 |
||

399 | movzx r10, byte [r3 +r2 ] |
||

400 | sub r10, r4 |
||

401 | sub r5, r10 |
||

402 | %else |
||

403 | movzx r6, byte [r3 +r2 ] |
||

404 | sub r6, r4 |
||

405 | lea r5, [r5+r6*8] |
||

406 | sub r5, r6 |
||

407 | %endif |
||

408 | |||

409 | movzx r4, byte [e_reg+r1 ] |
||

410 | movzx r6, byte [r3 +r2*2] |
||

411 | sub r6, r4 |
||

412 | %ifdef ARCH_X86_64 |
||

413 | add r6, r10 |
||

414 | %endif |
||

415 | lea r5, [r5+r6*8] |
||

416 | |||

417 | movzx r4, byte [e_reg+r2*2] |
||

418 | movzx r6, byte [r3 +r1 ] |
||

419 | sub r6, r4 |
||

420 | lea r5, [r5+r6*4] |
||

421 | add r5, r6 ; sum of V coefficients |
||

422 | |||

423 | %ifndef ARCH_X86_64 |
||

424 | mov r0, r0m |
||

425 | %endif |
||

426 | |||

427 | %ifidn %3, h264 |
||

428 | lea r5, [r5*5+32] |
||

429 | sar r5, 6 |
||

430 | %elifidn %3, rv40 |
||

431 | lea r5, [r5*5] |
||

432 | sar r5, 6 |
||

433 | %elifidn %3, svq3 |
||

434 | test r5, r5 |
||

435 | lea r6, [r5+3] |
||

436 | cmovs r5, r6 |
||

437 | sar r5, 2 ; V/4 |
||

438 | lea r5, [r5*5] ; 5*(V/4) |
||

439 | test r5, r5 |
||

440 | lea r6, [r5+15] |
||

441 | cmovs r5, r6 |
||

442 | sar r5, 4 ; (5*(V/4))/16 |
||

443 | %endif |
||

444 | |||

445 | movzx r4, byte [r0+r1 +15] |
||

446 | movzx r3, byte [r3+r2*2 ] |
||

447 | lea r3, [r3+r4+1] |
||

448 | shl r3, 4 |
||

449 | 1b3e43e4 | Ronald S. Bultje | |

450 | dd68d4db | Ronald S. Bultje | movd r1d, m0 |

451 | movsx r1d, r1w |
||

452 | 1b3e43e4 | Ronald S. Bultje | %ifnidn %3, svq3 |

453 | %ifidn %3, h264 |
||

454 | lea r1d, [r1d*5+32] |
||

455 | %else ; rv40 |
||

456 | lea r1d, [r1d*5] |
||

457 | %endif |
||

458 | sar r1d, 6 |
||

459 | %else ; svq3 |
||

460 | test r1d, r1d |
||

461 | lea r4d, [r1d+3] |
||

462 | cmovs r1d, r4d |
||

463 | sar r1d, 2 ; H/4 |
||

464 | lea r1d, [r1d*5] ; 5*(H/4) |
||

465 | test r1d, r1d |
||

466 | lea r4d, [r1d+15] |
||

467 | cmovs r1d, r4d |
||

468 | sar r1d, 4 ; (5*(H/4))/16 |
||

469 | %endif |
||

470 | movd m0, r1d |
||

471 | |||

472 | dd68d4db | Ronald S. Bultje | add r1d, r5d |

473 | add r3d, r1d |
||

474 | shl r1d, 3 |
||

475 | sub r3d, r1d ; a |
||

476 | |||

477 | movd m1, r5d |
||

478 | movd m3, r3d |
||

479 | %ifidn %1, mmx |
||

480 | punpcklwd m0, m0 |
||

481 | punpcklwd m1, m1 |
||

482 | punpcklwd m3, m3 |
||

483 | punpckldq m0, m0 |
||

484 | punpckldq m1, m1 |
||

485 | punpckldq m3, m3 |
||

486 | %elifidn %1, mmx2 |
||

487 | pshufw m0, m0, 0x0 |
||

488 | pshufw m1, m1, 0x0 |
||

489 | pshufw m3, m3, 0x0 |
||

490 | %else |
||

491 | pshuflw m0, m0, 0x0 |
||

492 | pshuflw m1, m1, 0x0 |
||

493 | pshuflw m3, m3, 0x0 |
||

494 | punpcklqdq m0, m0 ; splat H (words) |
||

495 | punpcklqdq m1, m1 ; splat V (words) |
||

496 | punpcklqdq m3, m3 ; splat a (words) |
||

497 | %endif |
||

498 | %ifidn %3, svq3 |
||

499 | SWAP 0, 1 |
||

500 | %endif |
||

501 | mova m2, m0 |
||

502 | %if mmsize == 8 |
||

503 | mova m5, m0 |
||

504 | %endif |
||

505 | pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
||

506 | %if mmsize == 16 |
||

507 | psllw m2, 3 |
||

508 | %else |
||

509 | psllw m5, 3 |
||

510 | psllw m2, 2 |
||

511 | mova m6, m5 |
||

512 | paddw m6, m2 |
||

513 | %endif |
||

514 | paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
||

515 | paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H |
||

516 | %if mmsize == 8 |
||

517 | paddw m5, m0 ; a + {8,9,10,11}*H |
||

518 | paddw m6, m0 ; a + {12,13,14,15}*H |
||

519 | %endif |
||

520 | |||

521 | mov r4, 8 |
||

522 | .loop |
||

523 | mova m3, m0 ; b[0..7] |
||

524 | mova m4, m2 ; b[8..15] |
||

525 | psraw m3, 5 |
||

526 | psraw m4, 5 |
||

527 | packuswb m3, m4 |
||

528 | mova [r0], m3 |
||

529 | %if mmsize == 8 |
||

530 | mova m3, m5 ; b[8..11] |
||

531 | mova m4, m6 ; b[12..15] |
||

532 | psraw m3, 5 |
||

533 | psraw m4, 5 |
||

534 | packuswb m3, m4 |
||

535 | mova [r0+8], m3 |
||

536 | %endif |
||

537 | paddw m0, m1 |
||

538 | paddw m2, m1 |
||

539 | %if mmsize == 8 |
||

540 | paddw m5, m1 |
||

541 | paddw m6, m1 |
||

542 | %endif |
||

543 | |||

544 | mova m3, m0 ; b[0..7] |
||

545 | mova m4, m2 ; b[8..15] |
||

546 | psraw m3, 5 |
||

547 | psraw m4, 5 |
||

548 | packuswb m3, m4 |
||

549 | mova [r0+r2], m3 |
||

550 | %if mmsize == 8 |
||

551 | mova m3, m5 ; b[8..11] |
||

552 | mova m4, m6 ; b[12..15] |
||

553 | psraw m3, 5 |
||

554 | psraw m4, 5 |
||

555 | packuswb m3, m4 |
||

556 | mova [r0+r2+8], m3 |
||

557 | %endif |
||

558 | paddw m0, m1 |
||

559 | paddw m2, m1 |
||

560 | %if mmsize == 8 |
||

561 | paddw m5, m1 |
||

562 | paddw m6, m1 |
||

563 | %endif |
||

564 | |||

565 | lea r0, [r0+r2*2] |
||

566 | dec r4 |
||

567 | jg .loop |
||

568 | REP_RET |
||

569 | %endmacro |
||

570 | |||

571 | INIT_MMX |
||

572 | H264_PRED16x16_PLANE mmx, 0, h264 |
||

573 | H264_PRED16x16_PLANE mmx, 0, rv40 |
||

574 | H264_PRED16x16_PLANE mmx, 0, svq3 |
||

575 | H264_PRED16x16_PLANE mmx2, 0, h264 |
||

576 | H264_PRED16x16_PLANE mmx2, 0, rv40 |
||

577 | H264_PRED16x16_PLANE mmx2, 0, svq3 |
||

578 | INIT_XMM |
||

579 | H264_PRED16x16_PLANE sse2, 8, h264 |
||

580 | H264_PRED16x16_PLANE sse2, 8, rv40 |
||

581 | H264_PRED16x16_PLANE sse2, 8, svq3 |
||

582 | H264_PRED16x16_PLANE ssse3, 8, h264 |
||

583 | H264_PRED16x16_PLANE ssse3, 8, rv40 |
||

584 | H264_PRED16x16_PLANE ssse3, 8, svq3 |
||

585 | |||

586 | ;----------------------------------------------------------------------------- |
||

587 | ; void pred8x8_plane(uint8_t *src, int stride) |
||

588 | ;----------------------------------------------------------------------------- |
||

589 | |||

590 | %macro H264_PRED8x8_PLANE 2 |
||

591 | cglobal pred8x8_plane_%1, 2, 7, %2 |
||

592 | mov r2, r1 ; +stride |
||

593 | neg r1 ; -stride |
||

594 | |||

595 | movd m0, [r0+r1 -1] |
||

596 | %if mmsize == 8 |
||

597 | pxor m2, m2 |
||

598 | movh m1, [r0+r1 +4 ] |
||

599 | punpcklbw m0, m2 |
||

600 | punpcklbw m1, m2 |
||

601 | pmullw m0, [pw_m4to4] |
||

602 | pmullw m1, [pw_m4to4+8] |
||

603 | %else ; mmsize == 16 |
||

604 | %ifidn %1, sse2 |
||

605 | pxor m2, m2 |
||

606 | movd m1, [r0+r1 +4] |
||

607 | punpckldq m0, m1 |
||

608 | punpcklbw m0, m2 |
||

609 | pmullw m0, [pw_m4to4] |
||

610 | %else ; ssse3 |
||

611 | movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary |
||

612 | pmaddubsw m0, [plane8_shuf] ; H coefficients |
||

613 | %endif |
||

614 | movhlps m1, m0 |
||

615 | %endif |
||

616 | paddw m0, m1 |
||

617 | |||

618 | %ifnidn %1, ssse3 |
||

619 | %ifidn %1, mmx |
||

620 | mova m1, m0 |
||

621 | psrlq m1, 32 |
||

622 | %elifidn %1, mmx2 |
||

623 | pshufw m1, m0, 0xE |
||

624 | %else ; mmsize == 16 |
||

625 | pshuflw m1, m0, 0xE |
||

626 | %endif |
||

627 | paddw m0, m1 |
||

628 | %endif ; !ssse3 |
||

629 | |||

630 | %ifidn %1, mmx |
||

631 | mova m1, m0 |
||

632 | psrlq m1, 16 |
||

633 | %elifidn %1, mmx2 |
||

634 | pshufw m1, m0, 0x1 |
||

635 | %else |
||

636 | pshuflw m1, m0, 0x1 |
||

637 | %endif |
||

638 | paddw m0, m1 ; sum of H coefficients |
||

639 | |||

640 | lea r4, [r0+r2*4-1] |
||

641 | lea r3, [r0 -1] |
||

642 | add r4, r2 |
||

643 | |||

644 | %ifdef ARCH_X86_64 |
||

645 | %define e_reg r11 |
||

646 | %else |
||

647 | %define e_reg r0 |
||

648 | %endif |
||

649 | |||

650 | movzx e_reg, byte [r3+r2*2 ] |
||

651 | movzx r5, byte [r4+r1 ] |
||

652 | sub r5, e_reg |
||

653 | |||

654 | movzx e_reg, byte [r3 ] |
||

655 | %ifdef ARCH_X86_64 |
||

656 | movzx r10, byte [r4+r2 ] |
||

657 | sub r10, e_reg |
||

658 | sub r5, r10 |
||

659 | %else |
||

660 | movzx r6, byte [r4+r2 ] |
||

661 | sub r6, e_reg |
||

662 | lea r5, [r5+r6*4] |
||

663 | sub r5, r6 |
||

664 | %endif |
||

665 | |||

666 | movzx e_reg, byte [r3+r1 ] |
||

667 | movzx r6, byte [r4+r2*2 ] |
||

668 | sub r6, e_reg |
||

669 | %ifdef ARCH_X86_64 |
||

670 | add r6, r10 |
||

671 | %endif |
||

672 | lea r5, [r5+r6*4] |
||

673 | |||

674 | movzx e_reg, byte [r3+r2 ] |
||

675 | movzx r6, byte [r4 ] |
||

676 | sub r6, e_reg |
||

677 | lea r6, [r5+r6*2] |
||

678 | |||

679 | lea r5, [r6*9+16] |
||

680 | lea r5, [r5+r6*8] |
||

681 | sar r5, 5 |
||

682 | |||

683 | %ifndef ARCH_X86_64 |
||

684 | mov r0, r0m |
||

685 | %endif |
||

686 | |||

687 | movzx r3, byte [r4+r2*2 ] |
||

688 | movzx r4, byte [r0+r1 +7] |
||

689 | lea r3, [r3+r4+1] |
||

690 | shl r3, 4 |
||

691 | movd r1d, m0 |
||

692 | movsx r1d, r1w |
||

693 | 80944df7 | Mans Rullgard | imul r1d, 17 |

694 | add r1d, 16 |
||

695 | sar r1d, 5 |
||

696 | movd m0, r1d |
||

697 | dd68d4db | Ronald S. Bultje | add r1d, r5d |

698 | sub r3d, r1d |
||

699 | add r1d, r1d |
||

700 | sub r3d, r1d ; a |
||

701 | |||

702 | movd m1, r5d |
||

703 | movd m3, r3d |
||

704 | %ifidn %1, mmx |
||

705 | punpcklwd m0, m0 |
||

706 | punpcklwd m1, m1 |
||

707 | punpcklwd m3, m3 |
||

708 | punpckldq m0, m0 |
||

709 | punpckldq m1, m1 |
||

710 | punpckldq m3, m3 |
||

711 | %elifidn %1, mmx2 |
||

712 | pshufw m0, m0, 0x0 |
||

713 | pshufw m1, m1, 0x0 |
||

714 | pshufw m3, m3, 0x0 |
||

715 | %else |
||

716 | pshuflw m0, m0, 0x0 |
||

717 | pshuflw m1, m1, 0x0 |
||

718 | pshuflw m3, m3, 0x0 |
||

719 | punpcklqdq m0, m0 ; splat H (words) |
||

720 | punpcklqdq m1, m1 ; splat V (words) |
||

721 | punpcklqdq m3, m3 ; splat a (words) |
||

722 | %endif |
||

723 | %if mmsize == 8 |
||

724 | mova m2, m0 |
||

725 | %endif |
||

726 | pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
||

727 | paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
||

728 | %if mmsize == 8 |
||

729 | psllw m2, 2 |
||

730 | paddw m2, m0 ; a + {4,5,6,7}*H |
||

731 | %endif |
||

732 | |||

733 | mov r4, 4 |
||

734 | ALIGN 16 |
||

735 | .loop |
||

736 | %if mmsize == 16 |
||

737 | mova m3, m0 ; b[0..7] |
||

738 | paddw m0, m1 |
||

739 | psraw m3, 5 |
||

740 | mova m4, m0 ; V+b[0..7] |
||

741 | paddw m0, m1 |
||

742 | psraw m4, 5 |
||

743 | packuswb m3, m4 |
||

744 | movh [r0], m3 |
||

745 | movhps [r0+r2], m3 |
||

746 | %else ; mmsize == 8 |
||

747 | mova m3, m0 ; b[0..3] |
||

748 | mova m4, m2 ; b[4..7] |
||

749 | paddw m0, m1 |
||

750 | paddw m2, m1 |
||

751 | psraw m3, 5 |
||

752 | psraw m4, 5 |
||

753 | mova m5, m0 ; V+b[0..3] |
||

754 | mova m6, m2 ; V+b[4..7] |
||

755 | paddw m0, m1 |
||

756 | paddw m2, m1 |
||

757 | psraw m5, 5 |
||

758 | psraw m6, 5 |
||

759 | packuswb m3, m4 |
||

760 | packuswb m5, m6 |
||

761 | mova [r0], m3 |
||

762 | mova [r0+r2], m5 |
||

763 | %endif |
||

764 | |||

765 | lea r0, [r0+r2*2] |
||

766 | dec r4 |
||

767 | jg .loop |
||

768 | REP_RET |
||

769 | %endmacro |
||

770 | |||

771 | INIT_MMX |
||

772 | H264_PRED8x8_PLANE mmx, 0 |
||

773 | H264_PRED8x8_PLANE mmx2, 0 |
||

774 | INIT_XMM |
||

775 | H264_PRED8x8_PLANE sse2, 8 |
||

776 | H264_PRED8x8_PLANE ssse3, 8 |
||

777 | |||

778 | ;----------------------------------------------------------------------------- |
||

779 | 4af8cdfc | Jason Garrett-Glaser | ; void pred8x8_vertical(uint8_t *src, int stride) |

780 | ;----------------------------------------------------------------------------- |
||

781 | |||

782 | cglobal pred8x8_vertical_mmx, 2,2 |
||

783 | sub r0, r1 |
||

784 | movq mm0, [r0] |
||

785 | %rep 3 |
||

786 | movq [r0+r1*1], mm0 |
||

787 | movq [r0+r1*2], mm0 |
||

788 | lea r0, [r0+r1*2] |
||

789 | %endrep |
||

790 | movq [r0+r1*1], mm0 |
||

791 | movq [r0+r1*2], mm0 |
||

792 | RET |
||

793 | |||

794 | ;----------------------------------------------------------------------------- |
||

795 | ; void pred8x8_horizontal(uint8_t *src, int stride) |
||

796 | ;----------------------------------------------------------------------------- |
||

797 | |||

798 | %macro PRED8x8_H 1 |
||

799 | cglobal pred8x8_horizontal_%1, 2,3 |
||

800 | mov r2, 4 |
||

801 | %ifidn %1, ssse3 |
||

802 | mova m2, [pb_3] |
||

803 | %endif |
||

804 | .loop: |
||

805 | movd m0, [r0+r1*0-4] |
||

806 | movd m1, [r0+r1*1-4] |
||

807 | %ifidn %1, ssse3 |
||

808 | pshufb m0, m2 |
||

809 | pshufb m1, m2 |
||

810 | %else |
||

811 | punpcklbw m0, m0 |
||

812 | punpcklbw m1, m1 |
||

813 | %ifidn %1, mmxext |
||

814 | pshufw m0, m0, 0xff |
||

815 | pshufw m1, m1, 0xff |
||

816 | %else |
||

817 | punpckhwd m0, m0 |
||

818 | punpckhwd m1, m1 |
||

819 | punpckhdq m0, m0 |
||

820 | punpckhdq m1, m1 |
||

821 | %endif |
||

822 | %endif |
||

823 | mova [r0+r1*0], m0 |
||

824 | mova [r0+r1*1], m1 |
||

825 | lea r0, [r0+r1*2] |
||

826 | dec r2 |
||

827 | jg .loop |
||

828 | REP_RET |
||

829 | %endmacro |
||

830 | |||

831 | INIT_MMX |
||

832 | PRED8x8_H mmx |
||

833 | PRED8x8_H mmxext |
||

834 | PRED8x8_H ssse3 |
||

835 | |||

836 | ;----------------------------------------------------------------------------- |
||

837 | 725a3f9d | Daniel Kang | ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride) |

838 | ;----------------------------------------------------------------------------- |
||

839 | %ifdef CONFIG_GPL |
||

840 | cglobal pred8x8_top_dc_mmxext, 2,5 |
||

841 | sub r0, r1 |
||

842 | movq mm0, [r0] |
||

843 | pxor mm1, mm1 |
||

844 | pxor mm2, mm2 |
||

845 | lea r2, [r0+r1*2] |
||

846 | punpckhbw mm1, mm0 |
||

847 | punpcklbw mm0, mm2 |
||

848 | psadbw mm1, mm2 ; s1 |
||

849 | lea r3, [r2+r1*2] |
||

850 | psadbw mm0, mm2 ; s0 |
||

851 | psrlw mm1, 1 |
||

852 | psrlw mm0, 1 |
||

853 | pavgw mm1, mm2 |
||

854 | lea r4, [r3+r1*2] |
||

855 | pavgw mm0, mm2 |
||

856 | pshufw mm1, mm1, 0 |
||

857 | pshufw mm0, mm0, 0 ; dc0 (w) |
||

858 | packuswb mm0, mm1 ; dc0,dc1 (b) |
||

859 | movq [r0+r1*1], mm0 |
||

860 | movq [r0+r1*2], mm0 |
||

861 | lea r0, [r3+r1*2] |
||

862 | movq [r2+r1*1], mm0 |
||

863 | movq [r2+r1*2], mm0 |
||

864 | movq [r3+r1*1], mm0 |
||

865 | movq [r3+r1*2], mm0 |
||

866 | movq [r0+r1*1], mm0 |
||

867 | movq [r0+r1*2], mm0 |
||

868 | RET |
||

869 | |||

870 | ;----------------------------------------------------------------------------- |
||

871 | a2dfe8d1 | Ronald S. Bultje | ; void pred8x8_dc_mmxext(uint8_t *src, int stride) |

872 | ;----------------------------------------------------------------------------- |
||

873 | e8d98764 | Ronald S. Bultje | |

874 | a2dfe8d1 | Ronald S. Bultje | INIT_MMX |

875 | cglobal pred8x8_dc_mmxext, 2,5 |
||

876 | sub r0, r1 |
||

877 | pxor m7, m7 |
||

878 | movd m0, [r0+0] |
||

879 | movd m1, [r0+4] |
||

880 | psadbw m0, m7 ; s0 |
||

881 | mov r4, r0 |
||

882 | psadbw m1, m7 ; s1 |
||

883 | |||

884 | movzx r2d, byte [r0+r1*1-1] |
||

885 | movzx r3d, byte [r0+r1*2-1] |
||

886 | lea r0, [r0+r1*2] |
||

887 | add r2d, r3d |
||

888 | movzx r3d, byte [r0+r1*1-1] |
||

889 | add r2d, r3d |
||

890 | movzx r3d, byte [r0+r1*2-1] |
||

891 | add r2d, r3d |
||

892 | lea r0, [r0+r1*2] |
||

893 | movd m2, r2d ; s2 |
||

894 | movzx r2d, byte [r0+r1*1-1] |
||

895 | movzx r3d, byte [r0+r1*2-1] |
||

896 | lea r0, [r0+r1*2] |
||

897 | add r2d, r3d |
||

898 | movzx r3d, byte [r0+r1*1-1] |
||

899 | add r2d, r3d |
||

900 | movzx r3d, byte [r0+r1*2-1] |
||

901 | add r2d, r3d |
||

902 | movd m3, r2d ; s3 |
||

903 | |||

904 | punpcklwd m0, m1 |
||

905 | mov r0, r4 |
||

906 | punpcklwd m2, m3 |
||

907 | punpckldq m0, m2 ; s0, s1, s2, s3 |
||

908 | pshufw m3, m0, 11110110b ; s2, s1, s3, s3 |
||

909 | lea r2, [r0+r1*2] |
||

910 | pshufw m0, m0, 01110100b ; s0, s1, s3, s1 |
||

911 | paddw m0, m3 |
||

912 | lea r3, [r2+r1*2] |
||

913 | psrlw m0, 2 |
||

914 | pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 |
||

915 | lea r4, [r3+r1*2] |
||

916 | packuswb m0, m0 |
||

917 | punpcklbw m0, m0 |
||

918 | movq m1, m0 |
||

919 | punpcklbw m0, m0 |
||

920 | punpckhbw m1, m1 |
||

921 | movq [r0+r1*1], m0 |
||

922 | movq [r0+r1*2], m0 |
||

923 | movq [r2+r1*1], m0 |
||

924 | movq [r2+r1*2], m0 |
||

925 | movq [r3+r1*1], m1 |
||

926 | movq [r3+r1*2], m1 |
||

927 | movq [r4+r1*1], m1 |
||

928 | movq [r4+r1*2], m1 |
||

929 | RET |
||

930 | %endif |
||

931 | |||

932 | ;----------------------------------------------------------------------------- |
||

933 | 4af8cdfc | Jason Garrett-Glaser | ; void pred8x8_dc_rv40(uint8_t *src, int stride) |

934 | ;----------------------------------------------------------------------------- |
||

935 | |||

936 | 270a85d2 | Jason Garrett-Glaser | cglobal pred8x8_dc_rv40_mmxext, 2,7 |

937 | 4af8cdfc | Jason Garrett-Glaser | mov r4, r0 |

938 | sub r0, r1 |
||

939 | pxor mm0, mm0 |
||

940 | psadbw mm0, [r0] |
||

941 | dec r0 |
||

942 | movzx r5d, byte [r0+r1*1] |
||

943 | movd r6d, mm0 |
||

944 | lea r0, [r0+r1*2] |
||

945 | %rep 3 |
||

946 | movzx r2d, byte [r0+r1*0] |
||

947 | movzx r3d, byte [r0+r1*1] |
||

948 | add r5d, r2d |
||

949 | add r6d, r3d |
||

950 | lea r0, [r0+r1*2] |
||

951 | %endrep |
||

952 | movzx r2d, byte [r0+r1*0] |
||

953 | add r5d, r6d |
||

954 | lea r2d, [r2+r5+8] |
||

955 | shr r2d, 4 |
||

956 | movd mm0, r2d |
||

957 | punpcklbw mm0, mm0 |
||

958 | pshufw mm0, mm0, 0 |
||

959 | mov r3d, 4 |
||

960 | .loop: |
||

961 | movq [r4+r1*0], mm0 |
||

962 | movq [r4+r1*1], mm0 |
||

963 | lea r4, [r4+r1*2] |
||

964 | dec r3d |
||

965 | jg .loop |
||

966 | REP_RET |
||

967 | |||

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;
; Every output pixel is clip_uint8(top[x] + left[y] - topleft).
;-----------------------------------------------------------------------------

; %1 = function-name suffix (mmx / mmxext). It also selects how the per-row
;      word (left - topleft) is broadcast: punpcklwd+punpckldq for "mmx",
;      pshufw for anything else.
; r0 = src, r1 = stride; r2-r5 and mm0-mm5, mm7 are used as scratch.
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub          r0, r1                 ; r0 = row directly above the block
    movzx       r4d, byte [r0-1]        ; r4d = top-left neighbour
    mov         r5d, 4                  ; 4 passes, two rows per pass
    pxor        mm7, mm7
    movq        mm0, [r0]
    movq        mm1, mm0
    punpcklbw   mm0, mm7                ; mm0 = top[0..3] widened to words
    punpckhbw   mm1, mm7                ; mm1 = top[4..7] widened to words
.row_pair:
    movzx       r2d, byte [r0+r1*1-1]   ; left neighbour of the first row
    movzx       r3d, byte [r0+r1*2-1]   ; left neighbour of the second row
    sub         r2d, r4d                ; left - topleft (may be negative)
    sub         r3d, r4d
    movd        mm2, r2d
    movd        mm4, r3d
%ifidn %1, mmx
    punpcklwd   mm2, mm2                ; broadcast the low word without pshufw
    punpcklwd   mm4, mm4
    punpckldq   mm2, mm2
    punpckldq   mm4, mm4
%else
    pshufw      mm2, mm2, 0
    pshufw      mm4, mm4, 0
%endif
    movq        mm3, mm2
    movq        mm5, mm4
    paddw       mm2, mm0
    paddw       mm3, mm1
    paddw       mm4, mm0
    paddw       mm5, mm1
    packuswb    mm2, mm3                ; unsigned saturation performs the clip
    packuswb    mm4, mm5
    movq  [r0+r1*1], mm2
    movq  [r0+r1*2], mm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .row_pair
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

1016 | |||

; SSE2 flavour of pred8x8_tm_vp8(uint8_t *src, int stride).
; The two rows of each pass are narrowed into one 128-bit register: the first
; row ends up in the low qword, the second in the high qword.
cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1                 ; r0 = row directly above the block
    movzx       r4d, byte [r0-1]        ; r4d = top-left neighbour
    mov         r5d, 4                  ; 4 passes, two rows per pass
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1               ; xmm0 = top[0..7] widened to words
.row_pair:
    movzx       r2d, byte [r0+r1*1-1]   ; left neighbour of the first row
    movzx       r3d, byte [r0+r1*2-1]   ; left neighbour of the second row
    sub         r2d, r4d                ; left - topleft
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0            ; broadcast into the low four words...
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2               ; ...then into all eight
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3               ; saturate; row 1 low half, row 2 high half
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .row_pair
    REP_RET

1044 | |||

; SSSE3 flavour of pred8x8_tm_vp8(uint8_t *src, int stride).
; Instead of going through general-purpose registers, the neighbour bytes are
; loaded as the last byte of a dword and spread with pshufb: tm_shuf copies
; source byte 3 into the low byte of every word and zeroes the high byte.
cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1                 ; r0 = row directly above the block
    movdqa     xmm4, [tm_shuf]
    mov         r2d, 4                  ; 4 passes, two rows per pass
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1               ; xmm0 = top[0..7] widened to words
    movd       xmm5, [r0-4]             ; byte 3 of this dword is the top-left pixel
    pshufb     xmm5, xmm4               ; xmm5 = topleft in every word
.row_pair:
    movd       xmm2, [r0+r1*1-4]        ; byte 3 = left neighbour, first row
    movd       xmm3, [r0+r1*2-4]        ; byte 3 = left neighbour, second row
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5               ; left - topleft
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3               ; saturate; row 1 low half, row 2 high half
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .row_pair
    REP_RET

1070 | 270a85d2 | Jason Garrett-Glaser | |

; PRED4x4_LOWPASS dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
;   %1 = dest   receives the filtered bytes
;   %2 = left   t[n-1] on entry; clobbered
;   %3 = right  t[n+1] on entry; clobbered
;   %4 = src    t[n]; only read
;   %5 = tmp    scratch; clobbered
; pavgb rounds up, so avg(left, right) is first corrected downwards by
; (left ^ right) & 1, which gives floor((left + right) / 2). A second
; rounding-up average with src then produces exactly the formula above.
; Works on whatever register width the current INIT_MMX / INIT_XMM selects
; (mova follows it).
%macro PRED4x4_LOWPASS 5
    mova        %5, %2                  ; keep a copy of left
    pavgb       %2, %3                  ; (left + right + 1) >> 1
    pxor        %3, %5                  ; left ^ right
    mova        %1, %4
    pand        %3, [pb_1]              ; 1 where the average was rounded up
    psubusb     %2, %3                  ; (left + right) >> 1
    pavgb       %1, %2                  ; (src + ((left + right) >> 1) + 1) >> 1
%endmacro

1082 | |||

1083 | 8b746bb4 | Jason Garrett-Glaser | ;----------------------------------------------------------------------------- |

1084 | 2e93fd4b | Daniel Kang | ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride) |

1085 | ;----------------------------------------------------------------------------- |
||

1086 | %ifdef CONFIG_GPL |
||

; Fills the 8x8 block with the rounded mean of the low-pass filtered top row.
;   %1 = function-name suffix; the PALIGNR definition in effect at the point
;        of instantiation supplies the matching byte-align implementation.
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3                 ; r0 = row above the block
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; mm3 = t[0..7]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t[-1..6] (left neighbours)
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t[1..8]  (right neighbours)
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:                              ; no top-left: low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:                              ; no top-right: high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw      mm7, mm0                ; mm7 was zero -> sum of the 8 filtered bytes
    paddw       mm7, [pw_4]             ; rounding term
    psrlw       mm7, 3                  ; divide by 8
    pshufw      mm7, mm7, 0
    packuswb    mm7, mm7                ; dc value in every byte
%rep 3
    movq  [r0+r3*1], mm7
    movq  [r0+r3*2], mm7
    lea          r0, [r0+r3*2]
%endrep
    movq  [r0+r3*1], mm7
    movq  [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

1139 | |||

;-----------------------------------------------------------------------------
;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;
; Fills the block with the rounded mean of the low-pass filtered left column
; and the low-pass filtered top row (16 samples).
;-----------------------------------------------------------------------------

; %1 = function-name suffix; PALIGNR must be %define'd before instantiation.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3                 ; r0 = row above the block
    ; Gather the byte left of each row (plus the top-left byte) from the high
    ; byte of the qword ending just before each row start; the interleaves
    ; pack them into mm3.
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; remember the top-row pointer
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]        ; qword ending at the last row's left pixel
    movq        mm1, [r4]               ; top row
    mov          r0, r4                 ; r0 = row above the block again
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test         r1, r1
    jnz .do_left
.fix_lt_1:                              ; no top-left: patch byte 6 of mm1 using byte 7 of mm3^mm4
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:                              ; no top-left: low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:                              ; no top-right: high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    movq        mm0, mm4                ; LOWPASS clobbers mm4; keep a copy
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3        ; mm7 = filtered left column
    ; now the top row
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top row
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7                ; sum of the filtered left bytes
    psadbw      mm1, mm6                ; sum of the filtered top bytes
    paddw       mm0, [pw_8]             ; rounding term
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4                  ; divide by 16
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0                ; dc value in every byte
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm0
    movq  [r1+r3*1], mm0
    movq  [r1+r3*2], mm0
    movq  [r2+r3*1], mm0
    movq  [r2+r3*2], mm0
    movq  [r4+r3*1], mm0
    movq  [r4+r3*2], mm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

1242 | |||

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;
; Every row of the block is filled with its low-pass filtered left neighbour.
;-----------------------------------------------------------------------------

; %1 = function-name suffix; PALIGNR must be %define'd before instantiation.
; r0 = src, r1 = has_topleft, r3 = stride. r2 (has_topright) is not consulted
; and is reused as a scratch pointer.
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3                 ; r0 = row above the block
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    lea          r1, [r0+r3]            ; lea leaves the flags from test intact
    cmovnz       r1, r0                 ; r1 = row above if has_topleft, else first row
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0                 ; remember the row-above pointer
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = gathered left-neighbour bytes
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq        mm0, mm4                ; LOWPASS clobbers mm4; keep a copy
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3        ; mm7 = filtered left column
    movq        mm3, mm7
    lea          r1, [r0+r3*2]
    punpckhbw   mm3, mm3                ; bytes 4..7, each doubled into a word
    punpcklbw   mm7, mm7                ; bytes 0..3, each doubled into a word
    pshufw      mm0, mm3, 0xff          ; one register per output row
    pshufw      mm1, mm3, 0xaa
    lea          r2, [r1+r3*2]
    pshufw      mm2, mm3, 0x55
    pshufw      mm3, mm3, 0x00
    pshufw      mm4, mm7, 0xff
    pshufw      mm5, mm7, 0xaa
    pshufw      mm6, mm7, 0x55
    pshufw      mm7, mm7, 0x00
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm1
    movq  [r1+r3*1], mm2
    movq  [r1+r3*2], mm3
    movq  [r2+r3*1], mm4
    movq  [r2+r3*2], mm5
    lea          r0, [r2+r3*2]
    movq  [r0+r3*1], mm6
    movq  [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

1314 | |||

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;
; Every row of the block is a copy of the low-pass filtered top row.
;-----------------------------------------------------------------------------

; %1 = function-name suffix; PALIGNR must be %define'd before instantiation.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3                 ; r0 = row above the block
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; mm3 = t[0..7]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t[-1..6] (left neighbours)
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t[1..8]  (right neighbours)
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:                              ; no top-left: low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:                              ; no top-right: high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm0
    lea          r0, [r0+r3*2]
%endrep
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

1365 | |||

;-----------------------------------------------------------------------------
;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

; Emits one further output row of pred8x8l_down_left_mmxext.
;   %1 = destination row (memory operand)
; mm1 holds the row stored last, mm0 the bytes still to be fed in: mm1 moves
; up by one byte, the top byte of mm0 drops into its low byte, and mm0 moves
; up by one byte in turn. Clobbers mm2.
%macro DOWN_LEFT_MMX_ROW 1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq         %1, mm1
%endmacro

; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_mmxext, 4,5
    sub          r0, r3                 ; r0 = row above the block
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; mm3 = t[0..7]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t[-1..6]
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t[1..8]
    test         r1, r1
    jz .no_topleft
    test         r2, r2
    jz .no_topright
    jmp .filter_top
.no_topleft:                            ; low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .filter_top
.no_topright:                           ; high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .filter_top
.replicate_t7:                          ; no top-right: use unfiltered t[7] for all 8 bytes
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .build_rows
.filter_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4                ; mm7 = filtered top row
    test         r2, r2
    jz .replicate_t7
    movq        mm0, [r0+8]             ; the 8 top-right pixels
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 ; mm1 = filtered top-right row
.build_rows:
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq  [r4+r3*2], mm1                ; bottom row first, then work upwards
    DOWN_LEFT_MMX_ROW [r4+r3*1]
    DOWN_LEFT_MMX_ROW [r2+r3*2]
    DOWN_LEFT_MMX_ROW [r2+r3*1]
    DOWN_LEFT_MMX_ROW [r1+r3*2]
    DOWN_LEFT_MMX_ROW [r1+r3*1]
    DOWN_LEFT_MMX_ROW [r0+r3*2]
    psllq       mm1, 8                  ; last row: mm0 need not be preserved
    psrlq       mm0, 56
    por         mm1, mm0
    movq  [r0+r3*1], mm1
    RET

1478 | |||

; SSE2/SSSE3 pred8x8l_down_left. Edge preparation runs in MMX registers; the
; filtered top and top-right rows are then joined in one XMM register so that
; all 16 samples are filtered at once and the eight rows are produced by
; successive one-byte right shifts.
;   %1 = function-name suffix; PALIGNR must be %define'd before instantiation.
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3                 ; r0 = row above the block
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; mm3 = t[0..7]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t[-1..6]
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t[1..8]
    test         r1, r1 ; top_left
    jz .no_topleft
    test         r2, r2 ; top_right
    jz .no_topright
    jmp .filter_top
.no_topleft:                            ; low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .filter_top
.no_topright:                           ; high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .filter_top
.replicate_t7:                          ; no top-right: use unfiltered t[7] for all 8 bytes
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .build_rows
.filter_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4                ; xmm3 low qword = filtered top row
    test         r2, r2 ; top_right
    jz .replicate_t7
    movq        mm0, [r0+8]             ; the 8 top-right pixels
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 ; mm1 = filtered top-right row
.build_rows:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1                ; last top-right byte, used as padding
    lea          r1, [r0+r3*2]
    pslldq     xmm4, 8
    por        xmm3, xmm4               ; xmm3 = top row | top-right row
    movdqa     xmm2, xmm3
    psrldq     xmm2, 1
    pslldq     xmm5, 15
    por        xmm2, xmm5               ; right neighbours, last byte repeated
    lea          r2, [r1+r3*2]
    movdqa     xmm1, xmm3
    pslldq     xmm1, 1                  ; left neighbours
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq     xmm0, 1
    movq  [r0+r3*1], xmm0
    psrldq     xmm0, 1
    movq  [r0+r3*2], xmm0
    psrldq     xmm0, 1
    lea          r0, [r2+r3*2]
    movq  [r1+r3*1], xmm0
    psrldq     xmm0, 1
    movq  [r1+r3*2], xmm0
    psrldq     xmm0, 1
    movq  [r2+r3*1], xmm0
    psrldq     xmm0, 1
    movq  [r2+r3*2], xmm0
    psrldq     xmm0, 1
    movq  [r0+r3*1], xmm0
    psrldq     xmm0, 1
    movq  [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

1568 | |||

;-----------------------------------------------------------------------------
;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

; Emits one further output row of pred8x8l_down_right_mmxext.
;   %1 = destination row (memory operand)
; mm0 holds the row stored last, mm1 the bytes still to be fed in: mm0 moves
; down by one byte, the low byte of mm1 enters as its top byte, and mm1 moves
; down by one byte in turn. Clobbers mm2.
%macro DOWN_RIGHT_MMX_ROW 1
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
    movq         %1, mm0
%endmacro

; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3                 ; r0 = row above the block
    ; Gather the byte left of each row (plus the top-left byte) into mm3.
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; remember the row-above pointer
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]               ; top row
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test         r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4                ; LOWPASS clobbers mm4; keep a copy
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3        ; mm7 = filtered left column
    ; now the top row
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4                ; mm5 = filtered top row
    jmp .body
.fix_lt_1:                              ; no top-left: patch byte 6 of mm1 using byte 7 of mm3^mm4
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:                              ; no top-left: low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:                              ; no top-right: high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    lea          r1, [r0+r3*2]
    movq        mm1, mm7
    movq        mm7, mm5
    movq        mm5, mm6
    movq        mm2, mm7
    lea          r2, [r1+r3*2]
    PALIGNR     mm2, mm6, 1, mm0
    movq        mm3, mm7
    PALIGNR     mm3, mm6, 7, mm0
    movq        mm4, mm7
    lea          r4, [r2+r3*2]
    psrlq       mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq  [r4+r3*2], mm0                ; bottom row first, then work upwards
    DOWN_RIGHT_MMX_ROW [r4+r3*1]
    DOWN_RIGHT_MMX_ROW [r2+r3*2]
    DOWN_RIGHT_MMX_ROW [r2+r3*1]
    DOWN_RIGHT_MMX_ROW [r1+r3*2]
    DOWN_RIGHT_MMX_ROW [r1+r3*1]
    DOWN_RIGHT_MMX_ROW [r0+r3*2]
    psrlq       mm0, 8                  ; last row: mm1 need not be preserved
    psllq       mm1, 56
    por         mm0, mm1
    movq  [r0+r3*1], mm0
    RET

1705 | 602a4cb2 | Daniel Kang | |

; SSE2/SSSE3 pred8x8l_down_right. The left column and the top row are
; low-pass filtered in MMX registers, joined in XMM registers and filtered
; once more; the rows are then produced by byte shifts of the result.
;   %1 = function-name suffix; PALIGNR must be %define'd before instantiation.
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch
%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub          r0, r3                 ; r0 = row above the block
    ; Gather the byte left of each row (plus the top-left byte) into mm3.
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; r4 = row above; kept for the final stores
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]               ; top row
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test         r1, r1
    jnz .do_left                        ; top-left present: no fix-up needed
.fix_lt_1:                              ; no top-left: patch byte 6 of mm1 using byte 7 of mm3^mm4
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:                              ; no top-left: low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:                              ; no top-right: high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    movq        mm0, mm4                ; LOWPASS clobbers mm4; keep a copy
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7                ; xmm1 low qword = filtered left column
    ; now the top row
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4                ; filtered top row
    lea          r1, [r0+r3*2]
    movdqa     xmm0, xmm3
    pslldq     xmm4, 8
    por        xmm3, xmm4               ; centre samples
    lea          r2, [r1+r3*2]
    pslldq     xmm4, 1
    por        xmm1, xmm4
    psrldq     xmm0, 7                  ; isolate byte 7 of the old xmm3 ...
    pslldq     xmm0, 15
    psrldq     xmm0, 7                  ; ... and place it at byte 8
    por        xmm1, xmm0               ; left neighbours
    lea          r0, [r2+r3*2]
    movdqa     xmm2, xmm3
    psrldq     xmm2, 1                  ; right neighbours
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa     xmm1, xmm0
    psrldq     xmm1, 1
    movq  [r0+r3*2], xmm0               ; bottom two rows first
    movq  [r0+r3*1], xmm1
    psrldq     xmm0, 2
    psrldq     xmm1, 2
    movq  [r2+r3*2], xmm0
    movq  [r2+r3*1], xmm1
    psrldq     xmm0, 2
    psrldq     xmm1, 2
    movq  [r1+r3*2], xmm0
    movq  [r1+r3*1], xmm1
    psrldq     xmm0, 2
    psrldq     xmm1, 2
    movq  [r4+r3*2], xmm0               ; r4 still points at the row above
    movq  [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

1822 | e916acbc | Daniel Kang | |

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------

; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride, r4 = scratch
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub          r0, r3                 ; r0 = row above the block
    ; Gather the byte left of each row (plus the top-left byte) into mm3.
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; remember the row-above pointer
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]               ; top row
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test         r1, r1
    jnz .do_left                        ; top-left present: no fix-up needed
.fix_lt_1:                              ; no top-left: patch byte 6 of mm1 using byte 7 of mm3^mm4
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:                              ; no top-left: low byte of mm2 := t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:                              ; no top-right: high byte of mm1 := t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm7, mm2                ; mm7 = filtered left column
    ; now the top row
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top row
    lea          r1, [r0+r3*2]
    movq        mm2, mm6
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm6, mm7, 6, mm1
    movq        mm4, mm3
    pavgb       mm3, mm2                ; first row: 2-tap averages
    lea          r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 ; second row: 3-tap filter
    movq  [r0+r3*1], mm3
    movq  [r0+r3*2], mm0
    movq        mm5, mm0                ; mm5 = even-row accumulator
    movq        mm6, mm3                ; mm6 = odd-row accumulator
    movq        mm1, mm7
    movq        mm2, mm1
    psllq       mm2, 8
    movq        mm3, mm1
    psllq       mm3, 16
    lea          r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 ; mm0 = bytes shifted into the lower rows
    ; Each remaining row is the row two above it moved up one byte, with the
    ; next byte of mm0 entering at the bottom.
    PALIGNR     mm6, mm0, 7, mm2
    movq  [r1+r3*1], mm6
    psllq       mm0, 8
    PALIGNR     mm5, mm0, 7, mm1
    movq  [r1+r3*2], mm5
    psllq       mm0, 8
    PALIGNR     mm6, mm0, 7, mm2
    movq  [r2+r3*1], mm6
    psllq       mm0, 8
    PALIGNR     mm5, mm0, 7, mm1
    movq  [r2+r3*2], mm5
    psllq       mm0, 8
    PALIGNR     mm6, mm0, 7, mm2
    movq  [r4+r3*1], mm6
    psllq       mm0, 8
    PALIGNR     mm5, mm0, 7, mm1
    movq  [r4+r3*2], mm5
    RET

1934 | bdd93f1b | Daniel Kang | |

1935 | %macro PRED8x8L_VERTICAL_RIGHT 1 |
||

1936 | cglobal pred8x8l_vertical_right_%1, 4,5,7 |
||

1937 | sub r0, r3 |
||

1938 | lea r4, [r0+r3*2] |
||

1939 | movq mm0, [r0+r3*1-8] |
||

1940 | punpckhbw mm0, [r0+r3*0-8] |
||

1941 | movq mm1, [r4+r3*1-8] |
||

1942 | punpckhbw mm1, [r0+r3*2-8] |
||

1943 | mov r4, r0 |
||

1944 | punpckhwd mm1, mm0 |
||

1945 | lea r0, [r0+r3*4] |
||

1946 | movq mm2, [r0+r3*1-8] |
||

1947 | punpckhbw mm2, [r0+r3*0-8] |
||

1948 | lea r0, [r0+r3*2] |
||

1949 | movq mm3, [r0+r3*1-8] |
||

1950 | punpckhbw mm3, [r0+r3*0-8] |
||

1951 | punpckhwd mm3, mm2 |
||

1952 | punpckhdq mm3, mm1 |
||

1953 | lea r0, [r0+r3*2] |
||

1954 | movq mm0, [r0+r3*0-8] |
||

1955 | movq mm1, [r4] |
||

1956 | mov r0, r4 |
||

1957 | movq mm4, mm3 |
||

1958 | movq mm2, mm3 |
||

1959 | PALIGNR mm4, mm0, 7, mm0 |
||

1960 | PALIGNR mm1, mm2, 1, mm2 |
||

1961 | test r1, r1 |
||

1962 | jnz .do_left |
||

1963 | .fix_lt_1: |
||

1964 | movq mm5, mm3 |
||

1965 | pxor mm5, mm4 |
||

1966 | psrlq mm5, 56 |
||

1967 | psllq mm5, 48 |
||

1968 | pxor mm1, mm5 |
||

1969 | jmp .do_left |
||

1970 | .fix_lt_2: |
||

1971 | movq mm5, mm3 |
||

1972 | pxor mm5, mm2 |
||

1973 | psllq mm5, 56 |
||

1974 | psrlq mm5, 56 |
||

1975 | pxor mm2, mm5 |
||

1976 | test r2, r2 |
||

1977 | jnz .do_top |
||

1978 | .fix_tr_1: |
||

1979 | movq mm5, mm3 |
||

1980 | pxor mm5, mm1 |
||

1981 | psrlq mm5, 56 |
||

1982 | psllq mm5, 56 |
||

1983 | pxor mm1, mm5 |
||

1984 | jmp .do_top |
||

1985 | .do_left: |
||

1986 | movq mm0, mm4 |
||

1987 | PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
||

1988 | movq2dq xmm0, mm2 |
||

1989 | movq mm0, [r0-8] |
||

1990 | movq mm3, [r0] |
||

1991 | movq mm1, [r0+8] |
||

1992 | movq mm2, mm3 |
||

1993 | movq mm4, mm3 |
||

1994 | PALIGNR mm2, mm0, 7, mm0 |
||

1995 | PALIGNR mm1, mm4, 1, mm4 |
||

1996 | test r1, r1 |
||

1997 | jz .fix_lt_2 |
||

1998 | test r2, r2 |
||

1999 | jz .fix_tr_1 |
||

2000 | .do_top |
||

2001 | PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 |
||

2002 | lea r1, [r0+r3*2] |
||

2003 | movq2dq xmm4, mm6 |
||

2004 | pslldq xmm4, 8 |
||

2005 | por xmm0, xmm4 |
||

2006 | movdqa xmm6, [pw_ff00] |
||

2007 | movdqa xmm1, xmm0 |
||

2008 | lea r2, [r1+r3*2] |
||

2009 | movdqa xmm2, xmm0 |
||

2010 | movdqa xmm3, xmm0 |
||

2011 | pslldq xmm0, 1 |
||

2012 | pslldq xmm1, 2 |
||

2013 | pavgb xmm2, xmm0 |
||

2014 | INIT_XMM |
||

2015 | PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 |
||

2016 | pandn xmm6, xmm4 |
||

2017 | movdqa xmm5, xmm4 |
||

2018 | psrlw xmm4, 8 |
||

2019 | packuswb xmm6, xmm4 |
||

2020 | movhlps xmm4, xmm6 |
||

2021 | movhps [r0+r3*2], xmm5 |
||

2022 | movhps [r0+r3*1], xmm2 |
||

2023 | psrldq xmm5, 4 |
||

2024 | movss xmm5, xmm6 |
||

2025 | psrldq xmm2, 4 |
||

2026 | movss xmm2, xmm4 |
||

2027 | lea r0, [r2+r3*2] |
||

2028 | psrldq xmm5, 1 |
||

2029 | psrldq xmm2, 1 |
||

2030 | movq [r0+r3*2], xmm5 |
||

2031 | movq [r0+r3*1], xmm2 |
||

2032 | psrldq xmm5, 1 |
||

2033 | psrldq xmm2, 1 |
||

2034 | movq [r2+r3*2], xmm5 |
||

2035 | movq [r2+r3*1], xmm2 |
||

2036 | psrldq xmm5, 1 |
||

2037 | psrldq xmm2, 1 |
||

2038 | movq [r1+r3*2], xmm5 |
||

2039 | movq [r1+r3*1], xmm2 |
||

2040 | RET |
||

2041 | %endmacro |
||

2042 | |||

; Expand PRED8x8L_VERTICAL_RIGHT twice. The expansions differ in the argument
; passed as %1 and in which PALIGNR implementation is %defined at the moment
; the macro body is expanded.
2043 | INIT_MMX |
||

2044 | %define PALIGNR PALIGNR_MMX |
||

2045 | PRED8x8L_VERTICAL_RIGHT sse2 |
||

; The macro body issues INIT_XMM part-way through, so the register mode is
; switched back to MMX before the second expansion.
2046 | INIT_MMX |
||

2047 | %define PALIGNR PALIGNR_SSSE3 |
||

2048 | PRED8x8L_VERTICAL_RIGHT ssse3 |
||

2049 | f25112fc | Daniel Kang | |

2050 | ;----------------------------------------------------------------------------- |
||

2051 | ecc7efbb | Daniel Kang | ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride) |

2052 | ;----------------------------------------------------------------------------- |
||

2053 | e8d98764 | Ronald S. Bultje | |

; PRED8x8L_VERTICAL_LEFT suffix
; Defines pred8x8l_vertical_left_<suffix>. %1 is used only in the symbol name;
; the PALIGNR variant is whatever PALIGNR is %defined to at expansion time.
; Arguments (order taken from the prototype comment above, 4 loaded by cglobal):
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
; r1 and r2 are reused as row pointers once the flags are no longer needed.
; NOTE(review): PALIGNR dst, src, n, tmp is assumed to behave like palignr
; (dst = low 8/16 bytes of dst:src shifted right by n bytes) and
; PRED4x4_LOWPASS is assumed to take (dst, left, right, src, tmp) -- confirm
; against their definitions, which are outside this block.
2054 | ecc7efbb | Daniel Kang | %macro PRED8x8L_VERTICAL_LEFT 1 |

2055 | cglobal pred8x8l_vertical_left_%1, 4,4 |
||

; r0 -> the row directly above the block.
2056 | sub r0, r3 |
||

; mm0 = 8 bytes to the left of the top row, mm3 = top row, mm1 = 8 bytes to its right.
2057 | movq mm0, [r0-8] |
||

2058 | movq mm3, [r0] |
||

2059 | movq mm1, [r0+8] |
||

2060 | movq mm2, mm3 |
||

2061 | movq mm4, mm3 |
||

; mm2 = top row with the byte from the left block shifted in at the bottom,
; mm1 = top row with the first byte of the right block shifted in at the top.
2062 | PALIGNR mm2, mm0, 7, mm0 |
||

2063 | PALIGNR mm1, mm4, 1, mm4 |
||

; Patch the neighbour vectors when the top-left / top-right samples are missing.
2064 | test r1, r1 |
||

2065 | jz .fix_lt_2 |
||

2066 | test r2, r2 |
||

2067 | jz .fix_tr_1 |
||

2068 | jmp .do_top |
||

; No top-left: xor trick that overwrites byte 0 of mm2 with byte 0 of mm3
; while leaving the other bytes of mm2 untouched.
2069 | .fix_lt_2: |
||

2070 | movq mm5, mm3 |
||

2071 | pxor mm5, mm2 |
||

2072 | psllq mm5, 56 |
||

2073 | psrlq mm5, 56 |
||

2074 | pxor mm2, mm5 |
||

2075 | test r2, r2 |
||

2076 | jnz .do_top |
||

; No top-right: same trick on the top byte, byte 7 of mm1 becomes byte 7 of mm3.
2077 | .fix_tr_1: |
||

2078 | movq mm5, mm3 |
||

2079 | pxor mm5, mm1 |
||

2080 | psrlq mm5, 56 |
||

2081 | psllq mm5, 56 |
||

2082 | pxor mm1, mm5 |
||

2083 | jmp .do_top |
||

; No top-right (reached from .do_top): fill mm1 with byte 7 of mm3 replicated
; eight times instead of filtering real top-right pixels.
2084 | .fix_tr_2: |
||

2085 | punpckhbw mm3, mm3 |
||

2086 | pshufw mm1, mm3, 0xFF |
||

2087 | jmp .do_topright |
||

; Filter the top row into mm4 and park it in the low half of xmm4.
2088 | .do_top: |
||

2089 | PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
||

2090 | movq2dq xmm4, mm4 |
||

2091 | test r2, r2 |
||

2092 | jz .fix_tr_2 |
||

; Top-right is available: build its neighbour vectors and filter it into mm1.
; mm5 starts as the last top-right byte alone, so the PALIGNR by 1 below
; repeats that byte as its own right-hand neighbour.
2093 | movq mm0, [r0+8] |
||

2094 | movq mm5, mm0 |
||

2095 | movq mm2, mm0 |
||

2096 | movq mm4, mm0 |
||

2097 | psrlq mm5, 56 |
||

2098 | PALIGNR mm2, mm3, 7, mm3 |
||

2099 | PALIGNR mm5, mm4, 1, mm4 |
||

2100 | PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 |
||

; xmm4 = filtered top row (low 8 bytes) | filtered top-right (high 8 bytes).
2101 | .do_topright: |
||

2102 | movq2dq xmm3, mm1 |
||

2103 | lea r1, [r0+r3*2] |
||

2104 | pslldq xmm3, 8 |
||

2105 | por xmm4, xmm3 |
||

2106 | movdqa xmm2, xmm4 |
||

2107 | movdqa xmm1, xmm4 |
||

2108 | movdqa xmm3, xmm4 |
||

2109 | psrldq xmm2, 1 |
||

2110 | pslldq xmm1, 1 |
||

; xmm3 = average of each byte with its right-hand neighbour.
2111 | pavgb xmm3, xmm2 |
||

2112 | lea r2, [r1+r3*2] |
||

; Switch x86inc to XMM mode for the rest of this expansion; the expansion
; sites issue INIT_MMX again before the macro is used a second time.
2113 | INIT_XMM |
||

2114 | PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 |
||

2115 | psrldq xmm0, 1 |
||

; Stores: the first row of each pair comes from xmm3 (averages), the second
; from xmm0 (low-pass); both vectors advance by one byte per pair.
; r0/r1/r2 step down the block two rows at a time.
2116 | movq [r0+r3*1], xmm3 |
||

2117 | movq [r0+r3*2], xmm0 |
||

2118 | lea r0, [r2+r3*2] |
||

2119 | psrldq xmm3, 1 |
||

2120 | psrldq xmm0, 1 |
||

2121 | movq [r1+r3*1], xmm3 |
||

2122 | movq [r1+r3*2], xmm0 |
||

2123 | psrldq xmm3, 1 |
||

2124 | psrldq xmm0, 1 |
||

2125 | movq [r2+r3*1], xmm3 |
||

2126 | movq [r2+r3*2], xmm0 |
||

2127 | psrldq xmm3, 1 |
||

2128 | psrldq xmm0, 1 |
||

2129 | movq [r0+r3*1], xmm3 |
||

2130 | movq [r0+r3*2], xmm0 |
||

2131 | RET |
||

2132 | %endmacro |
||

2133 | |||

; Expand PRED8x8L_VERTICAL_LEFT twice: once with PALIGNR bound to PALIGNR_MMX
; (symbol suffix sse2) and once bound to PALIGNR_SSSE3 (symbol suffix ssse3).
2134 | INIT_MMX |
||

2135 | %define PALIGNR PALIGNR_MMX |
||

2136 | PRED8x8L_VERTICAL_LEFT sse2 |
||

2137 | %define PALIGNR PALIGNR_SSSE3 |
||

; The macro body ends in INIT_XMM mode, so MMX mode is restored here before
; the second expansion.
2138 | INIT_MMX |
||

2139 | PRED8x8L_VERTICAL_LEFT ssse3 |
||

2140 | |||

2141 | ;----------------------------------------------------------------------------- |
||

2142 | 98c6053c | Daniel Kang | ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride) |

2143 | ;----------------------------------------------------------------------------- |
||

2144 | e8d98764 | Ronald S. Bultje | |

; PRED8x8L_HORIZONTAL_UP suffix
; Defines pred8x8l_horizontal_up_<suffix>. %1 is used only in the symbol name;
; the PALIGNR variant is whatever PALIGNR is %defined to at expansion time.
; Arguments (order taken from the prototype comment above, 4 loaded by cglobal):
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
; r2 is overwritten before it is ever read, so has_topright is not used here.
; NOTE(review): PALIGNR is assumed to behave like palignr with a scratch
; register as 4th operand, and PRED4x4_LOWPASS to take
; (dst, left, right, src, tmp) -- confirm against their definitions.
2145 | 98c6053c | Daniel Kang | %macro PRED8x8L_HORIZONTAL_UP 1 |

2146 | cglobal pred8x8l_horizontal_up_%1, 4,4 |
||

; r0 -> the row directly above the block.
2147 | sub r0, r3 |
||

2148 | lea r2, [r0+r3*2] |
||

2149 | movq mm0, [r0+r3*1-8] |
||

; Pick the row used for the top-left sample: the row above (r0) when
; has_topleft is non-zero, otherwise the block's first row (r0+r3), so the
; row above is never read without a top-left neighbour.
; lea does not modify flags, so cmovnz still sees the result of the test.
2150 | b9c7f66e | Ronald S. Bultje | test r1, r1 |

2151 | lea r1, [r0+r3] |
||

2152 | cmovnz r1, r0 |
||

; The unpack ladder below gathers into mm3 the top byte of each 8-byte load
; ending just left of a row, i.e. the pixel in the column left of the block.
2153 | punpckhbw mm0, [r1+r3*0-8] |
||

2154 | 98c6053c | Daniel Kang | movq mm1, [r2+r3*1-8] |

2155 | punpckhbw mm1, [r0+r3*2-8] |
||

; r2 now remembers the row-above pointer while r0 walks down the block.
2156 | mov r2, r0 |
||

2157 | punpckhwd mm1, mm0 |
||

2158 | lea r0, [r0+r3*4] |
||

2159 | movq mm2, [r0+r3*1-8] |
||

2160 | punpckhbw mm2, [r0+r3*0-8] |
||

2161 | lea r0, [r0+r3*2] |
||

2162 | movq mm3, [r0+r3*1-8] |
||

2163 | punpckhbw mm3, [r0+r3*0-8] |
||

2164 | punpckhwd mm3, mm2 |
||

2165 | punpckhdq mm3, mm1 |
||

2166 | lea r0, [r0+r3*2] |
||

; mm0 = 8 bytes left of the last row; its top byte is shifted into mm4 below.
2167 | movq mm0, [r0+r3*0-8] |
||

2168 | b9c7f66e | Ronald S. Bultje | movq mm1, [r1+r3*0-8] |

; Restore r0 to the row above the block.
2169 | 98c6053c | Daniel Kang | mov r0, r2 |

2170 | movq mm4, mm3 |
||

2171 | movq mm2, mm3 |
||

2172 | PALIGNR mm4, mm0, 7, mm0 |
||

2173 | PALIGNR mm1, mm2, 1, mm2 |
||

; Low-pass filter the left column (two passes; mm0 keeps a copy of mm4
; because the first PRED4x4_LOWPASS may overwrite its operands).
2174 | movq mm0, mm4 |
||

2175 | PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
||

2176 | movq mm4, mm0 |
||

2177 | movq mm7, mm2 |
||

2178 | PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
||

2179 | psllq mm1, 56 |
||

2180 | PALIGNR mm7, mm1, 7, mm3 |
||

2181 | lea r1, [r0+r3*2] |
||

; Reverse the byte order of the filtered column: pshufw reverses the words,
; then the psllw/psrlw/por sequence swaps the bytes inside each word.
2182 | pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 |
||

2183 | psllq mm7, 56 ; l7 .. .. .. .. .. .. .. |
||

2184 | movq mm2, mm0 |
||

2185 | psllw mm0, 8 |
||

2186 | psrlw mm2, 8 |
||

2187 | por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 |
||

2188 | movq mm3, mm2 |
||

2189 | movq mm4, mm2 |
||

2190 | movq mm5, mm2 |
||

; Build the column shifted by one and two samples, padding with l7.
2191 | psrlq mm2, 8 |
||

2192 | psrlq mm3, 16 |
||

2193 | lea r2, [r1+r3*2] |
||

2194 | por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 |
||

2195 | punpckhbw mm7, mm7 |
||

2196 | por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 |
||

; mm4 = 2-sample averages, mm1 = low-pass values; interleave them.
2197 | pavgb mm4, mm2 |
||

2198 | PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 |
||

2199 | movq mm5, mm4 |
||

2200 | punpcklbw mm4, mm1 ; p4 p3 p2 p1 |
||

2201 | punpckhbw mm5, mm1 ; p8 p7 p6 p5 |
||

2202 | movq mm6, mm5 |
||

2203 | movq mm7, mm5 |
||

2204 | movq mm0, mm5 |
||

; Each output row is the previous one advanced by 2 bytes. Rows 1-3 are taken
; across the mm5:mm4 pair with PALIGNR; rows 5-7 come from the high half
; alone, with pshufw repeating its top word as padding.
2205 | PALIGNR mm5, mm4, 2, mm1 |
||

2206 | pshufw mm1, mm6, 11111001b |
||

2207 | PALIGNR mm6, mm4, 4, mm2 |
||

2208 | pshufw mm2, mm7, 11111110b |
||

2209 | PALIGNR mm7, mm4, 6, mm3 |
||

2210 | pshufw mm3, mm0, 11111111b |
||

; Store rows 0..7 (r0, r1, r2 and the updated r0 each address two rows).
2211 | movq [r0+r3*1], mm4 |
||

2212 | movq [r0+r3*2], mm5 |
||

2213 | lea r0, [r2+r3*2] |
||

2214 | movq [r1+r3*1], mm6 |
||

2215 | movq [r1+r3*2], mm7 |
||

2216 | movq [r2+r3*1], mm0 |
||

2217 | movq [r2+r3*2], mm1 |
||

2218 | movq [r0+r3*1], mm2 |
||

2219 | movq [r0+r3*2], mm3 |
||

2220 | RET |
||

2221 | %endmacro |
||

2222 | |||

; Expand PRED8x8L_HORIZONTAL_UP twice: once with PALIGNR bound to PALIGNR_MMX
; (symbol suffix mmxext) and once bound to PALIGNR_SSSE3 (symbol suffix ssse3).
; The macro body never leaves MMX mode, so a single INIT_MMX covers both.
2223 | INIT_MMX |
||

2224 | %define PALIGNR PALIGNR_MMX |
||

2225 | PRED8x8L_HORIZONTAL_UP mmxext |
||

2226 | %define PALIGNR PALIGNR_SSSE3 |
||

2227 | PRED8x8L_HORIZONTAL_UP ssse3 |
||

2228 | |||

2229 | ;----------------------------------------------------------------------------- |
||

2230 | 04cbdf3d | Daniel Kang | ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride) |

2231 | ;----------------------------------------------------------------------------- |
||

2232 | e8d98764 | Ronald S. Bultje | |

; pred8x8l_horizontal_down_mmxext: MMX-only version (PALIGNR bound to PALIGNR_MMX).
; Arguments (order taken from the prototype comment above, 4 loaded by cglobal):
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride; r4 is scratch.
; r1, r2 and r4 are reused as row pointers after the flags have been tested.
; NOTE(review): PALIGNR is assumed to behave like palignr with a scratch
; register as 4th operand, and PRED4x4_LOWPASS to take
; (dst, left, right, src, tmp) -- confirm against their definitions.
2233 | 04cbdf3d | Daniel Kang | INIT_MMX |

2234 | %define PALIGNR PALIGNR_MMX |
||

2235 | cglobal pred8x8l_horizontal_down_mmxext, 4,5 |
||

; r0 -> the row directly above the block.
2236 | sub r0, r3 |
||

2237 | lea r4, [r0+r3*2] |
||

; Unpack ladder: gather into mm3 the top byte of each 8-byte load ending just
; left of a row (the row above plus the first seven block rows).
2238 | movq mm0, [r0+r3*1-8] |
||

2239 | punpckhbw mm0, [r0+r3*0-8] |
||

2240 | movq mm1, [r4+r3*1-8] |
||

2241 | punpckhbw mm1, [r0+r3*2-8] |
||

; r4 now remembers the row-above pointer while r0 walks down the block.
2242 | mov r4, r0 |
||

2243 | punpckhwd mm1, mm0 |
||

2244 | lea r0, [r0+r3*4] |
||

2245 | movq mm2, [r0+r3*1-8] |
||

2246 | punpckhbw mm2, [r0+r3*0-8] |
||

2247 | lea r0, [r0+r3*2] |
||

2248 | movq mm3, [r0+r3*1-8] |
||

2249 | punpckhbw mm3, [r0+r3*0-8] |
||

2250 | punpckhwd mm3, mm2 |
||

2251 | punpckhdq mm3, mm1 |
||

2252 | lea r0, [r0+r3*2] |
||

; mm0 = 8 bytes left of the last row, mm1 = the row above the block.
2253 | movq mm0, [r0+r3*0-8] |
||

2254 | movq mm1, [r4] |
||

2255 | mov r0, r4 |
||

2256 | movq mm4, mm3 |
||

2257 | movq mm2, mm3 |
||

2258 | PALIGNR mm4, mm0, 7, mm0 |
||

2259 | PALIGNR mm1, mm2, 1, mm2 |
||

2260 | test r1, r1 |
||

2261 | jnz .do_left |
||

; No top-left: replace byte 6 of mm1 with byte 7 of mm4 using an xor trick
; (byte 6 of mm1 equals byte 7 of mm3 after the PALIGNR by 1 above).
2262 | .fix_lt_1: |
||

2263 | movq mm5, mm3 |
||

2264 | pxor mm5, mm4 |
||

2265 | psrlq mm5, 56 |
||

2266 | psllq mm5, 48 |
||

2267 | pxor mm1, mm5 |
||

2268 | jmp .do_left |
||

; No top-left (top-row pass): byte 0 of mm2 becomes byte 0 of mm3.
2269 | .fix_lt_2: |
||

2270 | movq mm5, mm3 |
||

2271 | pxor mm5, mm2 |
||

2272 | psllq mm5, 56 |
||

2273 | psrlq mm5, 56 |
||

2274 | pxor mm2, mm5 |
||

2275 | test r2, r2 |
||

2276 | jnz .do_top |
||

; No top-right: byte 7 of mm1 becomes byte 7 of mm3.
2277 | .fix_tr_1: |
||

2278 | movq mm5, mm3 |
||

2279 | pxor mm5, mm1 |
||

2280 | psrlq mm5, 56 |
||

2281 | psllq mm5, 56 |
||

2282 | pxor mm1, mm5 |
||

2283 | jmp .do_top |
||

; Filter the left column. mm6 and mm7 carry the filtered column forward to
; .do_top; mm0 keeps a copy of mm4 for the second filter pass.
2284 | .do_left: |
||

2285 | movq mm0, mm4 |
||

2286 | PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
||

2287 | movq mm4, mm0 |
||

2288 | movq mm7, mm2 |
||

2289 | movq mm6, mm2 |
||

2290 | PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
||

2291 | psllq mm1, 56 |
||

2292 | PALIGNR mm7, mm1, 7, mm3 |
||

; Now load the top row and its left/right neighbours, as for the column.
2293 | movq mm0, [r0-8] |
||

2294 | movq mm3, [r0] |
||

2295 | movq mm1, [r0+8] |
||

2296 | movq mm2, mm3 |
||

2297 | movq mm4, mm3 |
||

2298 | PALIGNR mm2, mm0, 7, mm0 |
||

2299 | PALIGNR mm1, mm4, 1, mm4 |
||

2300 | test r1, r1 |
||

2301 | jz .fix_lt_2 |
||

2302 | test r2, r2 |
||

2303 | jz .fix_tr_1 |
||

; Filter the top row, then combine it with the filtered column (mm6/mm7).
; r1 and r2 are free from here on and become row pointers.
2304 | .do_top: |
||

2305 | PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
||

2306 | movq mm5, mm4 |
||

2307 | lea r1, [r0+r3*2] |
||

2308 | psllq mm7, 56 |
||

2309 | movq mm2, mm5 |
||

2310 | movq mm3, mm6 |
||

2311 | movq mm4, mm2 |
||

2312 | PALIGNR mm2, mm6, 7, mm5 |
||

2313 | PALIGNR mm6, mm7, 7, mm0 |
||

2314 | lea r2, [r1+r3*2] |
||

2315 | PALIGNR mm4, mm3, 1, mm7 |
||

2316 | movq mm5, mm3 |
||

; mm3 = 2-sample averages, mm0 and mm6 = low-pass values.
2317 | pavgb mm3, mm6 |
||

2318 | PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7 |
||

2319 | movq mm4, mm2 |
||

2320 | movq mm1, mm2 |
||

2321 | lea r4, [r2+r3*2] |
||

2322 | psrlq mm4, 16 |
||

2323 | psrlq mm1, 8 |
||

2324 | PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5 |
||

; Interleave averages with low-pass values for the lower rows.
2325 | movq mm7, mm3 |
||

2326 | punpcklbw mm3, mm0 |
||

2327 | punpckhbw mm7, mm0 |
||

2328 | movq mm1, mm7 |
||

2329 | movq mm0, mm7 |
||

2330 | movq mm4, mm7 |
||

; Rows are written bottom-up (row 7 first, row 0 last); each row is the one
; below it advanced by 2 bytes.
2331 | movq [r4+r3*2], mm3 |
||

2332 | PALIGNR mm7, mm3, 2, mm5 |
||

2333 | movq [r4+r3*1], mm7 |
||

2334 | PALIGNR mm1, mm3, 4, mm5 |
||

2335 | movq [r2+r3*2], mm1 |
||

2336 | PALIGNR mm0, mm3, 6, mm3 |
||

2337 | movq [r2+r3*1], mm0 |
||

2338 | movq mm2, mm6 |
||

2339 | movq mm3, mm6 |
||

2340 | movq [r1+r3*2], mm4 |
||

2341 | PALIGNR mm6, mm4, 2, mm5 |
||

2342 | movq [r1+r3*1], mm6 |
||

2343 | PALIGNR mm2, mm4, 4, mm5 |
||

2344 | movq [r0+r3*2], mm2 |
||

2345 | PALIGNR mm3, mm4, 6, mm4 |
||

2346 | movq [r0+r3*1], mm3 |
||

2347 | RET |
||

2348 | 57b1f334 | Daniel Kang | |

; PRED8x8L_HORIZONTAL_DOWN suffix
; Defines pred8x8l_horizontal_down_<suffix>. %1 is used only in the symbol name;
; the PALIGNR variant is whatever PALIGNR is %defined to at expansion time.
; Edge samples are gathered and filtered in MMX registers, then moved to XMM
; registers where the final rows are produced.
; Arguments (order taken from the prototype comment above, 4 loaded by cglobal):
;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride; r4 is scratch.
; NOTE(review): PALIGNR is assumed to behave like palignr with a scratch
; register as 4th operand, and PRED4x4_LOWPASS to take
; (dst, left, right, src, tmp) -- confirm against their definitions.
2349 | %macro PRED8x8L_HORIZONTAL_DOWN 1 |
||

2350 | cglobal pred8x8l_horizontal_down_%1, 4,5 |
||

; r0 -> the row directly above the block.
2351 | sub r0, r3 |
||

2352 | lea r4, [r0+r3*2] |
||

; Unpack ladder: gather into mm3 the top byte of each 8-byte load ending just
; left of a row (the row above plus the first seven block rows).
2353 | movq mm0, [r0+r3*1-8] |
||

2354 | punpckhbw mm0, [r0+r3*0-8] |
||

2355 | movq mm1, [r4+r3*1-8] |
||

2356 | punpckhbw mm1, [r0+r3*2-8] |
||

; r4 keeps the row-above pointer for the rest of the function.
2357 | mov r4, r0 |
||

2358 | punpckhwd mm1, mm0 |
||

2359 | lea r0, [r0+r3*4] |
||

2360 | movq mm2, [r0+r3*1-8] |
||

2361 | punpckhbw mm2, [r0+r3*0-8] |
||

2362 | lea r0, [r0+r3*2] |
||

2363 | movq mm3, [r0+r3*1-8] |
||

2364 | punpckhbw mm3, [r0+r3*0-8] |
||

2365 | punpckhwd mm3, mm2 |
||

2366 | punpckhdq mm3, mm1 |
||

2367 | lea r0, [r0+r3*2] |
||

; mm0 = 8 bytes left of the last row, mm1 = the row above the block.
2368 | movq mm0, [r0+r3*0-8] |
||

2369 | movq mm1, [r4] |
||

2370 | mov r0, r4 |
||

2371 | movq mm4, mm3 |
||

2372 | movq mm2, mm3 |
||

2373 | PALIGNR mm4, mm0, 7, mm0 |
||

2374 | PALIGNR mm1, mm2, 1, mm2 |
||

2375 | test r1, r1 |
||

2376 | jnz .do_left |
||

; No top-left: replace byte 6 of mm1 with byte 7 of mm4 using an xor trick
; (byte 6 of mm1 equals byte 7 of mm3 after the PALIGNR by 1 above).
2377 | .fix_lt_1: |
||

2378 | movq mm5, mm3 |
||

2379 | pxor mm5, mm4 |
||

2380 | psrlq mm5, 56 |
||

2381 | psllq mm5, 48 |
||

2382 | pxor mm1, mm5 |
||

2383 | jmp .do_left |
||

; No top-left (top-row pass): byte 0 of mm2 becomes byte 0 of mm3.
2384 | .fix_lt_2: |
||

2385 | movq mm5, mm3 |
||

2386 | pxor mm5, mm2 |
||

2387 | psllq mm5, 56 |
||

2388 | psrlq mm5, 56 |
||

2389 | pxor mm2, mm5 |
||

2390 | test r2, r2 |
||

2391 | jnz .do_top |
||

; No top-right: byte 7 of mm1 becomes byte 7 of mm3.
2392 | .fix_tr_1: |
||

2393 | movq mm5, mm3 |
||

2394 | pxor mm5, mm1 |
||

2395 | psrlq mm5, 56 |
||

2396 | psllq mm5, 56 |
||

2397 | pxor mm1, mm5 |
||

2398 | jmp .do_top |
||

; No top-right (reached from .do_top): fill mm1 with byte 7 of mm3 replicated
; eight times instead of filtering real top-right pixels.
2399 | .fix_tr_2: |
||

2400 | punpckhbw mm3, mm3 |
||

2401 | pshufw mm1, mm3, 0xFF |
||

2402 | jmp .do_topright |
||

; Filter the left column; the result is placed in the high half of xmm0 and
; the low byte of the second pass is merged in at byte 7.
2403 | .do_left: |
||

2404 | movq mm0, mm4 |
||

2405 | PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
||

2406 | movq2dq xmm0, mm2 |
||

2407 | pslldq xmm0, 8 |
||

2408 | movq mm4, mm0 |
||

2409 | PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
||

2410 | movq2dq xmm2, mm1 |
||

; Shift left by 15 then right by 8: keeps only byte 0, relocated to byte 7.
2411 | pslldq xmm2, 15 |
||

2412 | psrldq xmm2, 8 |
||

2413 | por xmm0, xmm2 |
||

; Load the top row and its left/right neighbours.
2414 | movq mm0, [r0-8] |
||

2415 | movq mm3, [r0] |
||

2416 | movq mm1, [r0+8] |
||

2417 | movq mm2, mm3 |
||

2418 | movq mm4, mm3 |
||

2419 | PALIGNR mm2, mm0, 7, mm0 |
||

2420 | PALIGNR mm1, mm4, 1, mm4 |
||

2421 | test r1, r1 |
||

2422 | jz .fix_lt_2 |
||

2423 | test r2, r2 |
||

2424 | jz .fix_tr_1 |
||

; Filter the top row into xmm1 (low half).
2425 | .do_top: |
||

2426 | PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
||

2427 | movq2dq xmm1, mm4 |
||

2428 | test r2, r2 |
||

2429 | jz .fix_tr_2 |
||

; Top-right is available: build its neighbour vectors and filter it into mm1.
2430 | movq mm0, [r0+8] |
||

2431 | movq mm5, mm0 |
||

2432 | movq mm2, mm0 |
||

2433 | movq mm4, mm0 |
||

2434 | psrlq mm5, 56 |
||

2435 | PALIGNR mm2, mm3, 7, mm3 |
||

2436 | PALIGNR mm5, mm4, 1, mm4 |
||

2437 | PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 |
||

; xmm1 = filtered top row (low 8 bytes) | filtered top-right (high 8 bytes).
2438 | .do_topright: |
||

2439 | movq2dq xmm5, mm1 |
||

2440 | pslldq xmm5, 8 |
||

2441 | por xmm1, xmm5 |
||

; Switch x86inc to XMM mode for the rest of this expansion; the expansion
; sites issue INIT_MMX again before each use of the macro.
2442 | INIT_XMM |
||

; r4 still points at the row above; r2, r1, r0 are set 2, 4 and 6 rows below it.
2443 | lea r2, [r4+r3*2] |
||

2444 | movdqa xmm2, xmm1 |
||

2445 | movdqa xmm3, xmm1 |
||

; Three views of the xmm1:xmm0 edge vector, offset by 7, 9 and 8 bytes.
2446 | PALIGNR xmm1, xmm0, 7, xmm4 |
||

2447 | PALIGNR xmm2, xmm0, 9, xmm5 |
||

2448 | lea r1, [r2+r3*2] |
||

2449 | PALIGNR xmm3, xmm0, 8, xmm0 |
||

2450 | movdqa xmm4, xmm1 |
||

; xmm4 = 2-sample averages, xmm0 = low-pass values.
2451 | pavgb xmm4, xmm3 |
||

2452 | lea r0, [r1+r3*2] |
||

2453 | PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 |
||

; Interleave the low halves; the upper 8 low-pass bytes are saved in xmm5
; before movhlps overwrites the low half of xmm0.
2454 | punpcklbw xmm4, xmm0 |
||

2455 | movhlps xmm0, xmm4 |
||

; xmm4 supplies rows 7,6,5,4 and xmm0 rows 3,2,1,0; both advance by 2 bytes
; between consecutive rows.
2456 | movq [r0+r3*2], xmm4 |
||

2457 | movq [r2+r3*2], xmm0 |
||

2458 | psrldq xmm4, 2 |
||

2459 | psrldq xmm0, 2 |
||

2460 | movq [r0+r3*1], xmm4 |
||

2461 | movq [r2+r3*1], xmm0 |
||

2462 | psrldq xmm4, 2 |
||

2463 | psrldq xmm0, 2 |
||

2464 | movq [r1+r3*2], xmm4 |
||

2465 | movq [r4+r3*2], xmm0 |
||

2466 | psrldq xmm4, 2 |
||

2467 | psrldq xmm0, 2 |
||

2468 | movq [r1+r3*1], xmm4 |
||

2469 | movq [r4+r3*1], xmm0 |
||

2470 | RET |
||

2471 | %endmacro |
||

2472 | |||

; Expand PRED8x8L_HORIZONTAL_DOWN twice: once with PALIGNR bound to PALIGNR_MMX
; (symbol suffix sse2) and once bound to PALIGNR_SSSE3 (symbol suffix ssse3).
2473 | INIT_MMX |
||

2474 | %define PALIGNR PALIGNR_MMX |
||

2475 | PRED8x8L_HORIZONTAL_DOWN sse2 |
||

; The macro body ends in INIT_XMM mode, so MMX mode is restored before the
; second expansion.
2476 | INIT_MMX |
||

2477 | %define PALIGNR PALIGNR_SSSE3 |
||

2478 | PRED8x8L_HORIZONTAL_DOWN ssse3 |
||

2479 | 04cbdf3d | Daniel Kang | %endif |

2480 | |||

2481 | ;----------------------------------------------------------------------------- |
||

2482 | 8b746bb4 | Jason Garrett-Glaser | ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride) |

2483 | ;----------------------------------------------------------------------------- |
||

2484 | |||

; pred4x4_dc_mmxext: fill a 4x4 block with the rounded mean of the four pixels
; above it and the four pixels to its left: (sum + 4) >> 3.
;   r0 = src, r1 = topright (not read; reused as scratch), r2 = stride
;   r3, r4 = scratch (5 GPRs requested from cglobal)
cglobal pred4x4_dc_mmxext, 3,5
    mov         r4, r0                  ; r4 -> row 0, kept for the first store
    sub         r0, r2                  ; r0 -> row above the block

    ; Sum of the four top pixels: psadbw against zero adds the bytes.
    pxor       mm7, mm7
    movd       mm0, [r0]
    psadbw     mm0, mm7
    movd       r3d, mm0
    add        r3d, 4                   ; rounding term for the >> 3 below

    ; Add the four left-column pixels (column -1 of rows 0..3).
    movzx      r1d, byte [r0+r2*1-1]    ; row 0
    add        r3d, r1d
    movzx      r1d, byte [r0+r2*2-1]    ; row 1
    add        r3d, r1d
    lea         r0, [r0+r2*2]           ; r0 -> row 1
    movzx      r1d, byte [r0+r2*1-1]    ; row 2
    add        r3d, r1d
    movzx      r1d, byte [r0+r2*2-1]    ; row 3
    add        r3d, r1d

    shr        r3d, 3                   ; mean of the 8 samples
    imul       r3d, 0x01010101          ; replicate the byte into all 4 lanes

    mov       [r4], r3d                 ; row 0
    mov       [r0], r3d                 ; row 1
    mov       [r0+r2*1], r3d            ; row 2
    mov       [r0+r2*2], r3d            ; row 3
    RET
||

2509 | fb9927ad | Jason Garrett-Glaser | |

2510 | ;----------------------------------------------------------------------------- |
||

2511 | ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2512 | ;----------------------------------------------------------------------------- |
||

2513 | |||

; PRED4x4_TM_MMX suffix
; Defines pred4x4_tm_vp8_<suffix>. For every pixel of the 4x4 block the result
; is top[x] + left[y] - topleft, clamped to 0..255 by packuswb.
; When the suffix is exactly "mmx" the word broadcast uses punpcklwd/punpckldq;
; any other suffix uses pshufw.
;   r0 = src, r1 = topright (not read; reused as scratch), r2 = stride
;   r3 = scratch, r4d = top-left pixel, r5d = loop counter (2 rows per pass)
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub         r0, r2                  ; r0 -> row above the block
    movzx      r4d, byte [r0-1]         ; top-left pixel
    pxor       mm7, mm7
    movd       mm0, [r0]
    punpcklbw  mm0, mm7                 ; mm0 = four top pixels as words
    mov        r5d, 2
.loop:
    ; left - topleft for the next two rows, as scalars
    movzx      r1d, byte [r0+r2*1-1]
    sub        r1d, r4d
    movd       mm2, r1d
    movzx      r3d, byte [r0+r2*2-1]
    sub        r3d, r4d
    movd       mm4, r3d
    ; broadcast the low word of each difference to all four lanes
%ifidn %1, mmx
    punpcklwd  mm2, mm2
    punpckldq  mm2, mm2
    punpcklwd  mm4, mm4
    punpckldq  mm4, mm4
%else
    pshufw     mm2, mm2, 0
    pshufw     mm4, mm4, 0
%endif
    ; first row of the pair
    paddw      mm2, mm0
    packuswb   mm2, mm2
    movd [r0+r2*1], mm2
    ; second row of the pair
    paddw      mm4, mm0
    packuswb   mm4, mm4
    movd [r0+r2*2], mm4
    lea         r0, [r0+r2*2]           ; advance two rows
    dec        r5d
    jg .loop
    REP_RET
%endmacro
||

2549 | |||

; Two expansions of PRED4x4_TM_MMX: "mmx" selects the punpcklwd/punpckldq
; broadcast inside the macro, "mmxext" selects the pshufw broadcast.
2550 | PRED4x4_TM_MMX mmx |
||

2551 | PRED4x4_TM_MMX mmxext |
||

2552 | |||

; pred4x4_tm_vp8_ssse3: same prediction as the PRED4x4_TM_MMX variants
; (top[x] + left[y] - topleft, clamped by packuswb), fully unrolled.
; tm_shuf is a pshufb mask that copies byte 3 of the source into the low byte
; of every word and zeroes the high byte, so a 4-byte load ending at column -1
; yields that pixel broadcast as four words.
;   r0 = src, r1 = topright (not read; reused as a row pointer), r2 = stride
cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2                  ; r0 -> row above the block
    lea         r1, [r0+r2*2]           ; r1 -> row 1
    movq       mm6, [tm_shuf]

    ; top row as words
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1

    ; top-left pixel broadcast as words
    movd       mm7, [r0-4]
    pshufb     mm7, mm6

    ; row 0
    movd       mm2, [r0+r2*1-4]
    pshufb     mm2, mm6
    psubw      mm2, mm7
    paddw      mm2, mm0
    packuswb   mm2, mm2
    movd [r0+r2*1], mm2

    ; row 1
    movd       mm3, [r0+r2*2-4]
    pshufb     mm3, mm6
    psubw      mm3, mm7
    paddw      mm3, mm0
    packuswb   mm3, mm3
    movd [r0+r2*2], mm3

    ; row 2
    movd       mm4, [r1+r2*1-4]
    pshufb     mm4, mm6
    psubw      mm4, mm7
    paddw      mm4, mm0
    packuswb   mm4, mm4
    movd [r1+r2*1], mm4

    ; row 3
    movd       mm5, [r1+r2*2-4]
    pshufb     mm5, mm6
    psubw      mm5, mm7
    paddw      mm5, mm0
    packuswb   mm5, mm5
    movd [r1+r2*2], mm5
    RET
||

2587 | bc14f04b | Jason Garrett-Glaser | |

2588 | ;----------------------------------------------------------------------------- |
||

2589 | ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2590 | ;----------------------------------------------------------------------------- |
||

2591 | |||

; pred4x4_vertical_vp8_mmxext: filter the row above the block once and copy
; the filtered four pixels into all four rows.
;   r0 = src, r1 = topright pointer (read once, then reused as a row pointer),
;   r2 = stride
; NOTE(review): PRED4x4_LOWPASS is assumed to take (dst, left, right, src, tmp)
; -- confirm against its definition, which is outside this block.
2592 | INIT_MMX |
||

2593 | cglobal pred4x4_vertical_vp8_mmxext, 3,3 |
||

; r0 -> the row directly above the block.
2594 | sub r0, r2 |
||

; m1 = top row starting one pixel to the left (left-hand neighbours).
2595 | movd m1, [r0-1] |
||

2596 | movd m0, [r0] |
||

2597 | mova m2, m0 ;t0 t1 t2 t3 |
||

; Append four bytes from the topright pointer, then drop t0 to obtain the
; right-hand neighbours.
2598 | punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 |
||

2599 | lea r1, [r0+r2*2] |
||

2600 | psrlq m0, 8 ;t1 t2 t3 t4 |
||

2601 | PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
||

; Same four bytes stored to rows 0..3.
2602 | movd [r0+r2*1], m3 |
||

2603 | movd [r0+r2*2], m3 |
||

2604 | movd [r1+r2*1], m3 |
||

2605 | movd [r1+r2*2], m3 |
||

2606 | RET |
||

2607 | 911b32f4 | Daniel Kang | |

2608 | ;----------------------------------------------------------------------------- |
||

2609 | ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2610 | ;----------------------------------------------------------------------------- |
||

2611 | 98928c83 | Ronald S. Bultje | %ifdef CONFIG_GPL |

; pred4x4_down_left_mmxext: low-pass filter the eight samples formed by the
; top row and four bytes from the topright pointer; each output row is the
; filtered vector advanced by one more byte.
;   r0 = src, r1 = topright pointer (read once, then reused as a row pointer),
;   r2 = stride
; NOTE(review): PRED4x4_LOWPASS is assumed to take (dst, left, right, src, tmp)
; -- confirm against its definition, which is outside this block.
2612 | 911b32f4 | Daniel Kang | INIT_MMX |

2613 | cglobal pred4x4_down_left_mmxext, 3,3 |
||

; r0 -> the row directly above the block.
2614 | sub r0, r2 |
||

; m1 = four top bytes in the low half, four topright bytes in the high half.
2615 | movq m1, [r0] |
||

2616 | punpckldq m1, [r1] |
||

2617 | movq m2, m1 |
||

2618 | movq m3, m1 |
||

2619 | movq m4, m1 |
||

; m1 = samples shifted up by one byte (left-hand neighbours).
2620 | psllq m1, 8 |
||

; xor trick: m3 ends up as the samples shifted down by one byte with the last
; byte repeated in the top position (right-hand neighbours).
2621 | pxor m2, m1 |
||

2622 | psrlq m2, 8 |
||

2623 | pxor m3, m2 |
||

2624 | PRED4x4_LOWPASS m0, m1, m3, m4, m5 |
||

2625 | lea r1, [r0+r2*2] |
||

; Rows 0..3: shift the filtered vector one more byte before each store.
2626 | psrlq m0, 8 |
||

2627 | movd [r0+r2*1], m0 |
||

2628 | psrlq m0, 8 |
||

2629 | movd [r0+r2*2], m0 |
||

2630 | psrlq m0, 8 |
||

2631 | movd [r1+r2*1], m0 |
||

2632 | psrlq m0, 8 |
||

2633 | movd [r1+r2*2], m0 |
||

2634 | RET |
||

2635 | 92f441ae | Daniel Kang | |

2636 | ;----------------------------------------------------------------------------- |
||

2637 | ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2638 | ;----------------------------------------------------------------------------- |
||

2639 | |||

; pred4x4_vertical_left_mmxext: even rows use 2-sample averages of the top /
; top-right samples, odd rows use the low-pass filtered samples; rows 2 and 3
; are rows 0 and 1 advanced by one byte.
;   r0 = src, r1 = topright pointer (read once, then reused as a row pointer),
;   r2 = stride
; NOTE(review): PRED4x4_LOWPASS is assumed to take (dst, left, right, src, tmp)
; and movh to store the low 4 bytes in MMX mode -- confirm against x86inc/x86util.
2640 | INIT_MMX |
||

2641 | cglobal pred4x4_vertical_left_mmxext, 3,3 |
||

; r0 -> the row directly above the block.
2642 | sub r0, r2 |
||

; m1 = four top bytes in the low half, four topright bytes in the high half.
2643 | movq m1, [r0] |
||

2644 | punpckldq m1, [r1] |
||

2645 | movq m3, m1 |
||

2646 | movq m2, m1 |
||

; m3 = samples advanced by one byte, m2 = advanced by two bytes.
2647 | psrlq m3, 8 |
||

2648 | psrlq m2, 16 |
||

2649 | movq m4, m3 |
||

; m4 = average of each sample with its right-hand neighbour.
2650 | pavgb m4, m1 |
||

2651 | PRED4x4_LOWPASS m0, m1, m2, m3, m5 |
||

2652 | lea r1, [r0+r2*2] |
||

; Rows 0 and 1.
2653 | movh [r0+r2*1], m4 |
||

2654 | movh [r0+r2*2], m0 |
||

; Rows 2 and 3: same vectors advanced by one byte.
2655 | psrlq m4, 8 |
||

2656 | psrlq m0, 8 |
||

2657 | movh [r1+r2*1], m4 |
||

2658 | movh [r1+r2*2], m0 |
||

2659 | RET |
||

2660 | e9c576a4 | Daniel Kang | |

2661 | ;----------------------------------------------------------------------------- |
||

2662 | ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2663 | ;----------------------------------------------------------------------------- |
||

2664 | |||

; pred4x4_horizontal_up_mmxext: builds the vector l0 l1 l2 l3 l3 l3 l3 l3 from
; the left column, interleaves its 2-sample averages with its low-pass values,
; and emits rows that each advance by two bytes; the last row is all l3.
;   r0 = src, r1 = topright (not read; reused as a row pointer), r2 = stride
; NOTE(review): PRED4x4_LOWPASS is assumed to take (dst, left, right, src, tmp)
; -- confirm against its definition, which is outside this block.
2665 | INIT_MMX |
||

2666 | cglobal pred4x4_horizontal_up_mmxext, 3,3 |
||

; r0 -> the row directly above the block, r1 -> row 1.
2667 | sub r0, r2 |
||

2668 | lea r1, [r0+r2*2] |
||

; Each 4-byte load ends at column -1, so byte 3 is the left pixel of that row.
; The unpacks collect l0 l1 l2 l3 into the high dword of m0.
2669 | 0790caba | Daniel Kang | movd m0, [r0+r2*1-4] |

2670 | punpcklbw m0, [r0+r2*2-4] |
||

2671 | movd m1, [r1+r2*1-4] |
||

2672 | punpcklbw m1, [r1+r2*2-4] |
||

2673 | e9c576a4 | Daniel Kang | punpckhwd m0, m1 |

; m1 = l3 replicated into every byte.
2674 | movq m1, m0 |
||

2675 | punpckhbw m1, m1 |
||

2676 | pshufw m1, m1, 0xFF |
||

; m0 = l0 l1 l2 l3 in the low dword, l3 x4 in the high dword.
2677 | punpckhdq m0, m1 |
||

2678 | movq m2, m0 |
||

2679 | movq m3, m0 |
||

2680 | movq m7, m0 |
||

2681 | psrlq m2, 16 |
||

2682 | psrlq m3, 8 |
||

; m7 = 2-sample averages, m4 = low-pass values; interleave them byte-wise.
2683 | pavgb m7, m3 |
||

2684 | PRED4x4_LOWPASS m4, m0, m2, m3, m5 |
||

2685 | punpcklbw m7, m4 |
||

; Rows 0..2 advance two bytes per row; row 3 is the replicated l3.
2686 | movd [r0+r2*1], m7 |
||

2687 | psrlq m7, 16 |
||

2688 | movd [r0+r2*2], m7 |
||

2689 | psrlq m7, 16 |
||

2690 | movd [r1+r2*1], m7 |
||

2691 | movd [r1+r2*2], m1 |
||

2692 | RET |
||

2693 | 76497232 | Daniel Kang | |

2694 | ;----------------------------------------------------------------------------- |
||

2695 | ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2696 | ;----------------------------------------------------------------------------- |
||

2697 | |||

; pred4x4_horizontal_down_mmxext: packs left column, top-left and top samples
; into one edge vector, interleaves its 2-sample averages with its low-pass
; values and writes the rows bottom-up.
;   r0 = src, r1 = topright (not read; reused as a row pointer), r2 = stride
; The lane comments below list bytes from most significant to least significant.
; NOTE(review): PALIGNR is assumed to behave like palignr with a scratch
; register as 4th operand, and PRED4x4_LOWPASS to take
; (dst, left, right, src, tmp) -- confirm against their definitions.
2698 | INIT_MMX |
||

2699 | %define PALIGNR PALIGNR_MMX |
||

2700 | cglobal pred4x4_horizontal_down_mmxext, 3,3 |
||

; r0 -> the row directly above the block, r1 -> row 1.
2701 | sub r0, r2 |
||

2702 | lea r1, [r0+r2*2] |
||

2703 | movh m0, [r0-4] ; lt .. |
||

2704 | punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. |
||

2705 | psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. |
||

; 4-byte loads ending at column -1: byte 3 is the left pixel of each row.
2706 | 0790caba | Daniel Kang | movd m1, [r1+r2*2-4] ; l3 |

2707 | punpcklbw m1, [r1+r2*1-4] ; l2 l3 |
||

2708 | movd m2, [r0+r2*2-4] ; l1 |
||

2709 | punpcklbw m2, [r0+r2*1-4] ; l0 l1 |
||

2710 | 76497232 | Daniel Kang | punpckhwd m1, m2 ; l0 l1 l2 l3 |

2711 | punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 |
||

2712 | movq m0, m1 |
||

2713 | movq m2, m1 |
||

2714 | movq m5, m1 |
||

2715 | psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 |
||

2716 | psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 |
||

; m5 = 2-sample averages, m3 = low-pass values.
2717 | pavgb m5, m2 |
||

2718 | PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
||

; Interleave averages and low-pass values for the lower part of the edge;
; the upper low-pass values are appended via PALIGNR for row 0.
2719 | punpcklbw m5, m3 |
||

2720 | psrlq m3, 32 |
||

2721 | PALIGNR m3, m5, 6, m4 |
||

; Rows 3, 2, 1 come from m5 advancing two bytes per row; row 0 comes from m3.
2722 | movh [r1+r2*2], m5 |
||

2723 | psrlq m5, 16 |
||

2724 | movh [r1+r2*1], m5 |
||

2725 | psrlq m5, 16 |
||

2726 | movh [r0+r2*2], m5 |
||

2727 | movh [r0+r2*1], m3 |
||

2728 | RET |
||

2729 | d0aebe23 | Daniel Kang | |

2730 | ;----------------------------------------------------------------------------- |
||

2731 | ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2732 | ;----------------------------------------------------------------------------- |
||

2733 | |||

; pred4x4_vertical_right_mmxext: row 0 is the 2-sample average of the top
; edge, row 1 the low-pass filtered edge; rows 2 and 3 are rows 0 and 1 moved
; one pixel to the right with a filtered left-column sample shifted in.
;   r0 = src, r1 = topright (not read; reused as a row pointer), r2 = stride
; The lane comments below list bytes from most significant to least significant.
; NOTE(review): PALIGNR is assumed to behave like palignr with a scratch
; register as 4th operand, and PRED4x4_LOWPASS to take
; (dst, left, right, src, tmp) -- confirm against their definitions.
2734 | INIT_MMX |
||

2735 | %define PALIGNR PALIGNR_MMX |
||

2736 | cglobal pred4x4_vertical_right_mmxext, 3,3 |
||

; r0 -> the row directly above the block, r1 -> row 1.
2737 | sub r0, r2 |
||

2738 | lea r1, [r0+r2*2] |
||

2739 | movh m0, [r0] ; ........t3t2t1t0 |
||

2740 | movq m5, m0 |
||

; Each PALIGNR by 7 shifts in the top byte of an 8-byte load that ends at
; column -1, i.e. the pixel just left of that row.
2741 | PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt |
||

; m5 = average of each top sample with its left neighbour (row 0).
2742 | pavgb m5, m0 |
||

2743 | PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 |
||

2744 | movq m1, m0 |
||

2745 | PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 |
||

2746 | movq m2, m0 |
||

2747 | PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 |
||

2748 | PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
||

; m1 keeps the two lowest filtered bytes in its top two byte positions so they
; can be shifted into rows 2 and 3; m3 >> 16 is row 1.
2749 | movq m1, m3 |
||

2750 | psrlq m3, 16 |
||

2751 | psllq m1, 48 |
||

2752 | movh [r0+r2*1], m5 |
||

2753 | movh [r0+r2*2], m3 |
||

; Row 2 = row 0 shifted by one byte with the top byte of m1 inserted.
2754 | PALIGNR m5, m1, 7, m2 |
||

2755 | psllq m1, 8 |
||

2756 | movh [r1+r2*1], m5 |
||

; Row 3 = row 1 shifted by one byte with the next byte of m1 inserted.
2757 | PALIGNR m3, m1, 7, m1 |
||

2758 | movh [r1+r2*2], m3 |
||

2759 | RET |
||

2760 | 720ea2d5 | Daniel Kang | |

2761 | ;----------------------------------------------------------------------------- |
||

2762 | ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
||

2763 | ;----------------------------------------------------------------------------- |
||

2764 | |||

; pred4x4_down_right_mmxext: packs the left column, top-left and top row into
; one edge vector, low-pass filters it once, and writes the rows bottom-up with
; the filtered vector advanced by one byte per row.
;   r0 = src, r1 = topright (not read; reused as a row pointer), r2 = stride
; NOTE(review): PALIGNR is assumed to behave like palignr with a scratch
; register as 4th operand, and PRED4x4_LOWPASS to take
; (dst, left, right, src, tmp) -- confirm against their definitions.
2765 | INIT_MMX |
||

2766 | %define PALIGNR PALIGNR_MMX |
||

2767 | cglobal pred4x4_down_right_mmxext, 3,3 |
||

; r0 -> the row directly above the block, r1 -> row 1.
2768 | sub r0, r2 |
||

2769 | lea r1, [r0+r2*2] |
||

; 8-byte loads ending at column -1: the top byte of each is the pixel just
; left of that row (row 1, row 0 and the row above, respectively).
2770 | movq m1, [r1-8] |
||

2771 | movq m2, [r0+r2*1-8] |
||

2772 | punpckhbw m2, [r0-8] |
||

2773 | movh m3, [r0] |
||

2774 | punpckhwd m1, m2 |
||

; Prepend the three gathered edge bytes below the top row.
2775 | PALIGNR m3, m1, 5, m1 |
||

2776 | movq m1, m3 |
||

; Shift in the left pixels of rows 2 and 3, keeping the intermediate vectors
; as the neighbour inputs of the filter.
2777 | PALIGNR m3, [r1+r2*1-8], 7, m4 |
||

2778 | movq m2, m3 |
||

2779 | PALIGNR m3, [r1+r2*2-8], 7, m4 |
||

2780 | PRED4x4_LOWPASS m0, m3, m1, m2, m4 |
||

; Rows 3, 2, 1, 0: advance the filtered vector one byte per row.
2781 | movh [r1+r2*2], m0 |
||

2782 | psrlq m0, 8 |
||

2783 | movh [r1+r2*1], m0 |
||

2784 | psrlq m0, 8 |
||

2785 | movh [r0+r2*2], m0 |
||

2786 | psrlq m0, 8 |
||

2787 | movh [r0+r2*1], m0 |
||

2788 | RET |
||

2789 | 98928c83 | Ronald S. Bultje | %endif |