;****************************************************************************** |
;* MMX optimized DSP utils |

;* Copyright (c) 2008 Loren Merritt |

;* |

;* This file is part of FFmpeg. |

;* |

;* FFmpeg is free software; you can redistribute it and/or |

;* modify it under the terms of the GNU Lesser General Public |

;* License as published by the Free Software Foundation; either |

;* version 2.1 of the License, or (at your option) any later version. |

;* |

;* FFmpeg is distributed in the hope that it will be useful, |

;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

;* Lesser General Public License for more details. |

;* |

;* You should have received a copy of the GNU Lesser General Public |

;* License along with FFmpeg; if not, write to the Free Software |

;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

;****************************************************************************** |

%include "x86inc.asm" |

SECTION_RODATA

; Byte tables consumed by the add_hfyu_left_prediction kernels in this file.
; They are fed to pshufb as control masks; a control byte with its top bit set
; (-1 here, "z" in the label names) makes pshufb write zero to that lane.

; Broadcast selector for the 16-byte kernel: every lane picks source byte 15.
pb_f:                   times 16 db 0x0f

; Only 8 bytes live under this label, but the SSE4 kernel loads 16 bytes from
; it, so pb_7 MUST remain immediately behind it to supply the upper half.
pb_zzzzzzzz77777777:    times  8 db -1
; Also used on its own by the 8-byte (MMX-register) kernel as "pick byte 7".
pb_7:                   times  8 db 0x07

pb_zzzz3333zzzzbbbb:    db -1,-1,-1,-1,  3, 3, 3, 3, -1,-1,-1,-1, 11,11,11,11
pb_zz11zz55zz99zzdd:    db -1,-1, 1, 1, -1,-1, 5, 5, -1,-1, 9, 9, -1,-1,13,13

section .text align=16 |

; PSWAPD_SSE dst, src
; Stand-in for pswapd built on pshufw: dst = src with its two 32-bit halves
; exchanged. Selector 01001110b picks words 2,3,0,1 of src.
%macro PSWAPD_SSE 2
    pshufw  %1, %2, 01001110b
%endmacro

; PSWAPD_3DN1 dst, src
; Stand-in for pswapd using only movq/psrlq/punpckldq: dst = src with its two
; 32-bit halves exchanged. dst and src must be different registers, because
; src is read again after dst has been modified.
%macro PSWAPD_3DN1 2
    movq        %1, %2          ; dst = src
    psrlq       %1, 32          ; dst = { src.hi, 0 }
    punpckldq   %1, %2          ; dst = { src.hi, src.lo }
%endmacro

;-----------------------------------------------------------------------------
; FLOAT_TO_INT16_INTERLEAVE6 suffix
;
; Emits float_to_int16_interleave6_<suffix>:
;   void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
;
; src points at six channel pointers. Each loop iteration converts two samples
; from every channel to int32, packs them to int16 with signed saturation and
; stores the 12 results interleaved (24 bytes) at dst.
; The loop is do-while shaped and consumes two samples per pass, so it always
; runs at least once; presumably len is a positive even number - confirm with
; the callers.
; `cvtps2pi` and `pswapd` are taken as whatever they expand to at the point of
; instantiation.
;-----------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE6 1
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
    ; r2 (len) is about to be reused under the name src1, so the length is
    ; kept in r10d on x86-64 and counted down in its stack slot otherwise.
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif

    ; Fetch the channel pointers; srcq itself is replaced last because it is
    ; the base of the pointer table.
    mov     src1q, [srcq + 1*gprsize]
    mov     src2q, [srcq + 2*gprsize]
    mov     src3q, [srcq + 3*gprsize]
    mov     src4q, [srcq + 4*gprsize]
    mov     src5q, [srcq + 5*gprsize]
    mov     srcq,  [srcq]

    ; Turn channels 1..5 into offsets from channel 0, so that advancing srcq
    ; alone walks all six channels.
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq

.loop:
    ; Two int32 samples per channel (channels named a..f below).
    cvtps2pi    mm0, [srcq]             ; a0 a1
    cvtps2pi    mm1, [srcq + src1q]     ; b0 b1
    cvtps2pi    mm2, [srcq + src2q]     ; c0 c1
    cvtps2pi    mm3, [srcq + src3q]     ; d0 d1
    cvtps2pi    mm4, [srcq + src4q]     ; e0 e1
    cvtps2pi    mm5, [srcq + src5q]     ; f0 f1

    ; Narrow to int16 (saturating), pairing channel n with channel n+3.
    packssdw    mm0, mm3                ; a0 a1 d0 d1
    packssdw    mm1, mm4                ; b0 b1 e0 e1
    packssdw    mm2, mm5                ; c0 c1 f0 f1

    ; Shuffle into sample-major order.
    pswapd      mm3, mm0                ; d0 d1 a0 a1
    punpcklwd   mm0, mm1                ; a0 b0 a1 b1
    punpckhwd   mm1, mm2                ; e0 f0 e1 f1
    punpcklwd   mm2, mm3                ; c0 d0 c1 d1
    pswapd      mm3, mm0                ; a1 b1 a0 b0
    punpckldq   mm0, mm2                ; a0 b0 c0 d0
    punpckhdq   mm2, mm1                ; c1 d1 e1 f1
    punpckldq   mm1, mm3                ; e0 f0 a1 b1

    movq    [dstq     ], mm0
    movq    [dstq +  8], mm1
    movq    [dstq + 16], mm2

    add     srcq, 8                     ; 2 floats per channel
    add     dstq, 24                    ; 6 channels * 2 samples * 2 bytes
    sub     lend, 2
    jg      .loop

    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

; The order of the %define/%undef lines below is significant: each
; instantiation picks up whatever `cvtps2pi` and `pswapd` mean at that point.

; "sse" flavour: cvtps2pi is assembled as written, the dword swap is PSWAPD_SSE.
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse

; "3dnow" flavour: conversion is redirected to pf2id, the swap to PSWAPD_3DN1.
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow

; "3dn2" flavour: still pf2id, but with the override removed `pswapd` is no
; longer a macro and is assembled as the plain mnemonic.
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2

; Drop the pf2id redirection so later code sees cvtps2pi unmodified.
%undef cvtps2pi

;-----------------------------------------------------------------------------
; SCALARPRODUCT suffix
;
; Emits two functions for the register width currently selected (mmsize):
;   scalarproduct_int16_<suffix>
;   scalarproduct_and_madd_int16_<suffix>
; Both loops are do-while shaped and consume mmsize*2 bytes (mmsize int16
; elements) per pass, so they always run at least once; presumably `order` is
; a positive multiple of mmsize - confirm with the callers.
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
;
; Returns the sum over i of (v1[i] * v2[i]); the 32-bit partial sums are
; arithmetically shifted right by `shift` before the final two are added.
; v1 is read with unaligned loads; v2 is used as a direct pmaddwd memory
; operand, which in the 16-byte build requires it to be 16-byte aligned.
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl     orderq, 1                   ; element count -> byte count
    add     v1q, orderq                 ; point both vectors at their end ...
    add     v2q, orderq
    neg     orderq                      ; ... and index upwards from -bytes to 0
    movd    m3, shiftm
    pxor    m2, m2                      ; m2 = running dword partial sums
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl      .loop
    ; Horizontal reduction of the dword lanes of m2.
%if mmsize == 16
    movhlps m0, m2                      ; fold the upper 8 bytes onto the lower
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e                ; swap the two low dwords
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e                ; swap the two dwords
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
;
; Returns the sum over i of (v1[i] * v2[i]) using the ORIGINAL v1 values, and
; updates v1 in place: v1[i] += mul * v3[i] (16-bit wrap-around arithmetic).
; v1 is accessed with aligned loads/stores (mova); v2 and v3 with unaligned
; loads.
;
; The argument count passed to cglobal is 4, not 3: `order` is the fourth
; argument and is used as a register (orderq) below, so it has to be among the
; arguments that the prologue loads. With 3 it was left unloaded on targets
; that pass arguments on the stack. `mul` is only read through mulm.
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1                   ; element count -> byte count
    movd    m7, mulm
    ; Broadcast the low 16 bits of mul to every word lane of m7.
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6                      ; m6 = running dword partial sums
    add     v1q, orderq                 ; point all vectors at their end ...
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq                      ; ... and index upwards from -bytes to 0
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4                      ; v2 * v1 (pairs summed into dwords)
    pmaddwd m1, m5
    pmullw  m2, m7                      ; v3 * mul (low 16 bits)
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4                      ; v1 + v3 * mul
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl      .loop
    ; Horizontal reduction of the dword lanes of m6.
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

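; NOTE(review): the text that belonged on these lines was lost; only bare line
; numbers were left behind. Nothing visible in this file instantiates the
; SCALARPRODUCT macro, and the SSSE3 code that follows uses XMM-only
; instructions, so this spot presumably held the register-mode selection
; (INIT_MMX / INIT_XMM) and the SCALARPRODUCT <suffix> invocations.
; TODO: restore from the upstream file.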

;-----------------------------------------------------------------------------
; SCALARPRODUCT_LOOP byte_offset
;
; One loop body of scalarproduct_and_madd_int16_ssse3, specialised for a fixed
; misalignment (%1 bytes) of v2/v3 relative to a 16-byte boundary. Emits the
; label .loop<byte_offset>. It is expanded inside that function and relies on
; its register state:
;   v1q          aligned destination / multiplicand
;   v2q, v3q     rounded DOWN to 16 bytes by the caller
;   orderq       remaining byte count, walked downwards in steps of mmsize*2
;   m4, m5       aligned block of v2 / v3 loaded on the previous pass (or by
;                the function's setup code), used as the upper half for palignr
;   m6           dword accumulator, m7 = mul broadcast to all words
; For %1 != 0 the unaligned data is rebuilt from two aligned loads with
; palignr; the same shift is applied to v3 as to v2.
; Every non-zero variant ends with a jump to the function's .end label; the
; zero variant falls through, so it has to be expanded last.
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2            ; sets the flags tested by jg below
%if %1
    mova    m1, m4                      ; previous low block becomes the top
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1                  ; m1 = v2 bytes [order+16 .. order+31]
    palignr m0, m4, %1                  ; m0 = v2 bytes [order    .. order+15]
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1                  ; same reconstruction for v3
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    ; t0/t1 = the two v1 blocks. With 16 vector registers available they are
    ; loaded once into m8/m9; otherwise they stay memory operands.
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0                      ; v2 * v1
    pmaddwd m1, t1
    pmullw  m2, m7                      ; v3 * mul
    pmullw  m3, m7
    paddw   m2, t0                      ; v1 + v3 * mul
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    ; None of the vector instructions above modify EFLAGS, so this still
    ; tests the result of the sub at the top of the loop.
    jg      .loop%1
%if %1
    jmp     .end
%endif
%endmacro
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
;
; SSSE3 variant: returns the sum over i of (v1[i] * v2[i]) and updates
; v1[i] += mul * v3[i] in place. v1 is accessed with aligned 16-byte moves.
; v2 and v3 are rounded down to 16 bytes and their data is rebuilt with
; palignr; only v2's low address bits are examined, so v3 is assumed to have
; the same offset within 16 bytes as v2, and that offset is assumed to be even
; (odd values would fall into the 14-byte variant).
; NOTE(review): this code uses XMM-only forms (pshuflw, punpcklqdq, movhlps,
; m8/m9), so it relies on XMM register mode having been selected earlier in
; the file; no INIT_XMM is visible directly ahead of it here - confirm.
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1                   ; element count -> byte count

    ; Read mul BEFORE r4 is used as scratch below: mul is the fifth argument,
    ; and where that argument arrives in a register, mulm may be r4 itself.
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7                   ; m7 = mul in all eight word lanes
    pxor    m6, m6                      ; m6 = dword accumulator

    mov     r4d, v2d
    and     r4d, 15                     ; r4d = byte offset of v2 inside its block
    and     v2q, ~15
    and     v3q, ~15

    ; Prime the "previous block" registers with the aligned blocks at the
    ; current (end) position of v2 and v3; the loops walk downwards from here.
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]

    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je      .loop0
    cmp     r4d, 2
    je      .loop2
    cmp     r4d, 4
    je      .loop4
    cmp     r4d, 6
    je      .loop6
    cmp     r4d, 8
    je      .loop8
    cmp     r4d, 10
    je      .loop10
    cmp     r4d, 12
    je      .loop12
    ; Anything else falls through into the 14-byte variant. The zero variant
    ; is expanded last because it is the only one that falls through to .end.
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    ; Horizontal sum of the four dword lanes of m6.
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
;
; For every byte position: dst = median(l, t, l + t - tl) + diff, with all
; arithmetic on bytes (wrap-around), where t = top, tl = the top byte one
; position to the left, and l = the previous output byte.
; Eight bytes are handled per outer iteration. The prediction for each byte
; depends on the previous output, so the eight positions are computed one
; after another in the low byte lane (unrolled with %rep) and collected in mm7.
; On return *left and *left_top hold the last dst byte and the last top byte.
; The loop is do-while shaped (at least one 8-byte group is processed);
; presumably w is a positive multiple of 8 or the buffers are padded - confirm.
; No emms is issued here.
;
; Register roles inside the loop:
;   mm0 = t - tl   mm1 = t   mm2 = diff   mm3 = l (low byte)   mm7 = output
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    ; First group: tl for byte 0 comes from *left_top instead of the previous
    ; group. This assumes the upper bytes of *left_top are zero - confirm.
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2                    ; mm4 = tl vector
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl

    ; Point the buffers at their end and index with a negative offset.
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp     .skip

.loop:
    movq    mm4, [topq + wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1                    ; byte 0 of mm1 = last top byte of the previous group
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq + wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1                    ; max(l, t)
    pminub  mm5, mm1                    ; min(l, t)
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    ; Shift the new byte into the top of mm7; after eight rounds the eight
    ; results sit in mm7 in memory order.
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
    ; Bring the next position's inputs down into the low byte lane.
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq + wq], mm7
    add     wq, 8
    jl      .loop

    ; Hand the final l / tl values back to the caller as zero-extended ints.
    movzx   r2d, byte [dstq - 1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq - 1]
    mov     [left_topq], r2d
    RET
;-----------------------------------------------------------------------------
; ADD_HFYU_LEFT_LOOP is_aligned
;
; Shared body (including the final RET) of the add_hfyu_left_prediction
; kernels: dst[i] = left + src[0] + ... + src[i], in byte arithmetic.
; Expects, set up by the enclosing function:
;   m0 = left in its highest byte
;   m3 = pb_zz11zz55zz99zzdd, m4 = pb_zzzz3333zzzzbbbb
;   m5 = selector that broadcasts the highest byte (pb_7 / pb_f)
;   m6 = pb_zzzzzzzz77777777 (16-byte registers only)
; src is read with aligned loads in both variants. %1 selects the store:
; 1 = one aligned store, 0 = two 8-byte stores (movq + movhps, so that variant
; is only meaningful with 16-byte registers).
; The loop is do-while shaped; a width that is not a multiple of mmsize is
; handled by overshooting, so loads and stores may run past w bytes.
;-----------------------------------------------------------------------------
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq                    ; point at the end, index from -w up
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq + wq]
    ; In-register prefix sum, doubling the span each step.
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2                      ; sums over 2 bytes
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2                      ; sums over 4 bytes
    pshufb  m0, m5                      ; broadcast the previous running total
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2                      ; sums over 8 bytes
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2                      ; sums over 16 bytes
%endif
    paddb   m0, m1                      ; add the carried-in total
%if %1
    mova    [dstq + wq], m0
%else
    movq    [dstq + wq], m0
    movhps  [dstq + wq + 8], m0
%endif
    add     wq, mmsize
    jl      %%.loop

    ; wq is now the number of bytes overshot (0 .. mmsize-1), so the last
    ; valid output byte is lane mmsize-1-wq; move it to lane 0 and return it.
    ; Only the low byte of eax carries that value; the other three bytes are
    ; copies of lane 0 of m0.
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;
; SSSE3 kernel on 8-byte (MMX) registers: writes the running byte sum of src,
; seeded with `left`, to dst and returns the last value in the low byte of eax.
; src and dst are accessed with aligned 8-byte moves.
; .skip_prologue is also a jump target for the SSE4 kernel, so everything the
; loop needs (constants and m0) is set up after that label.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7 GLOBAL]                   ; broadcast-byte-7 selector
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    psllq   m0, 56                              ; left -> highest byte of m0
    ADD_HFYU_LEFT_LOOP 1
; SSE4-named kernel of ff_add_hfyu_left_prediction on 16-byte registers
; (same contract as the SSSE3 one: running byte sum of src seeded with `left`,
; last value returned in the low byte of eax).
; Dispatch on alignment:
;   src not 16-byte aligned -> continue in the 8-byte SSSE3 kernel, entering
;                              after its prologue (both are declared 3,3,7)
;   dst not 16-byte aligned -> 16-byte loop with split 8-byte stores
;   otherwise               -> 16-byte loop with aligned stores
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f GLOBAL]                   ; broadcast-byte-15 selector
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]    ; 16-byte load: runs on into pb_7
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    movd    m0, leftm
    pslldq  m0, 15                              ; left -> highest byte of m0
    test    srcq, 15
    jnz     add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz     .unaligned
    ADD_HFYU_LEFT_LOOP 1                        ; ends in RET
.unaligned:
    ADD_HFYU_LEFT_LOOP 0