ffmpeg / libavcodec / i386 / dsputil_mmx_avg.h @ 755bfeab
History  View  Annotate  Download (36.3 KB)
1 
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 */

26  
27 
/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
   clobber bug - now it will work with 2.95.2 and also with -fPIC
 */

30 
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
31 
{ 
32 
__asm __volatile( 
33 
"lea (%3, %3), %%"REG_a" \n\t" 
34 
"1: \n\t"

35 
"movq (%1), %%mm0 \n\t"

36 
"movq (%1, %3), %%mm1 \n\t"

37 
PAVGB" 1(%1), %%mm0 \n\t"

38 
PAVGB" 1(%1, %3), %%mm1 \n\t"

39 
"movq %%mm0, (%2) \n\t"

40 
"movq %%mm1, (%2, %3) \n\t"

41 
"add %%"REG_a", %1 \n\t" 
42 
"add %%"REG_a", %2 \n\t" 
43 
"movq (%1), %%mm0 \n\t"

44 
"movq (%1, %3), %%mm1 \n\t"

45 
PAVGB" 1(%1), %%mm0 \n\t"

46 
PAVGB" 1(%1, %3), %%mm1 \n\t"

47 
"add %%"REG_a", %1 \n\t" 
48 
"movq %%mm0, (%2) \n\t"

49 
"movq %%mm1, (%2, %3) \n\t"

50 
"add %%"REG_a", %2 \n\t" 
51 
"subl $4, %0 \n\t"

52 
"jnz 1b \n\t"

53 
:"+g"(h), "+S"(pixels), "+D"(block) 
54 
:"r" ((long)line_size) 
55 
:"%"REG_a, "memory"); 
56 
} 
57  
58 
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
59 
{ 
60 
__asm __volatile( 
61 
"testl $1, %0 \n\t"

62 
" jz 1f \n\t"

63 
"movd (%1), %%mm0 \n\t"

64 
"movd (%2), %%mm1 \n\t"

65 
"add %4, %1 \n\t"

66 
"add $4, %2 \n\t"

67 
PAVGB" %%mm1, %%mm0 \n\t"

68 
"movd %%mm0, (%3) \n\t"

69 
"add %5, %3 \n\t"

70 
"decl %0 \n\t"

71 
"1: \n\t"

72 
"movd (%1), %%mm0 \n\t"

73 
"add %4, %1 \n\t"

74 
"movd (%1), %%mm1 \n\t"

75 
"movd (%2), %%mm2 \n\t"

76 
"movd 4(%2), %%mm3 \n\t"

77 
"add %4, %1 \n\t"

78 
PAVGB" %%mm2, %%mm0 \n\t"

79 
PAVGB" %%mm3, %%mm1 \n\t"

80 
"movd %%mm0, (%3) \n\t"

81 
"add %5, %3 \n\t"

82 
"movd %%mm1, (%3) \n\t"

83 
"add %5, %3 \n\t"

84 
"movd (%1), %%mm0 \n\t"

85 
"add %4, %1 \n\t"

86 
"movd (%1), %%mm1 \n\t"

87 
"movd 8(%2), %%mm2 \n\t"

88 
"movd 12(%2), %%mm3 \n\t"

89 
"add %4, %1 \n\t"

90 
PAVGB" %%mm2, %%mm0 \n\t"

91 
PAVGB" %%mm3, %%mm1 \n\t"

92 
"movd %%mm0, (%3) \n\t"

93 
"add %5, %3 \n\t"

94 
"movd %%mm1, (%3) \n\t"

95 
"add %5, %3 \n\t"

96 
"add $16, %2 \n\t"

97 
"subl $4, %0 \n\t"

98 
"jnz 1b \n\t"

99 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
100 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
101 
#else

102 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
103 
#endif

104 
:"S"((long)src1Stride), "D"((long)dstStride) 
105 
:"memory");

106 
} 
107  
108  
109 
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
110 
{ 
111 
__asm __volatile( 
112 
"testl $1, %0 \n\t"

113 
" jz 1f \n\t"

114 
"movq (%1), %%mm0 \n\t"

115 
"movq (%2), %%mm1 \n\t"

116 
"add %4, %1 \n\t"

117 
"add $8, %2 \n\t"

118 
PAVGB" %%mm1, %%mm0 \n\t"

119 
"movq %%mm0, (%3) \n\t"

120 
"add %5, %3 \n\t"

121 
"decl %0 \n\t"

122 
"1: \n\t"

123 
"movq (%1), %%mm0 \n\t"

124 
"add %4, %1 \n\t"

125 
"movq (%1), %%mm1 \n\t"

126 
"add %4, %1 \n\t"

127 
PAVGB" (%2), %%mm0 \n\t"

128 
PAVGB" 8(%2), %%mm1 \n\t"

129 
"movq %%mm0, (%3) \n\t"

130 
"add %5, %3 \n\t"

131 
"movq %%mm1, (%3) \n\t"

132 
"add %5, %3 \n\t"

133 
"movq (%1), %%mm0 \n\t"

134 
"add %4, %1 \n\t"

135 
"movq (%1), %%mm1 \n\t"

136 
"add %4, %1 \n\t"

137 
PAVGB" 16(%2), %%mm0 \n\t"

138 
PAVGB" 24(%2), %%mm1 \n\t"

139 
"movq %%mm0, (%3) \n\t"

140 
"add %5, %3 \n\t"

141 
"movq %%mm1, (%3) \n\t"

142 
"add %5, %3 \n\t"

143 
"add $32, %2 \n\t"

144 
"subl $4, %0 \n\t"

145 
"jnz 1b \n\t"

146 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
147 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
148 
#else

149 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
150 
#endif

151 
:"S"((long)src1Stride), "D"((long)dstStride) 
152 
:"memory");

153 
//the following should be used, though better not with gcc ...

154 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

155 
:"r"(src1Stride), "r"(dstStride)

156 
:"memory");*/

157 
} 
158  
159 
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
160 
{ 
161 
__asm __volatile( 
162 
"pcmpeqb %%mm6, %%mm6 \n\t"

163 
"testl $1, %0 \n\t"

164 
" jz 1f \n\t"

165 
"movq (%1), %%mm0 \n\t"

166 
"movq (%2), %%mm1 \n\t"

167 
"add %4, %1 \n\t"

168 
"add $8, %2 \n\t"

169 
"pxor %%mm6, %%mm0 \n\t"

170 
"pxor %%mm6, %%mm1 \n\t"

171 
PAVGB" %%mm1, %%mm0 \n\t"

172 
"pxor %%mm6, %%mm0 \n\t"

173 
"movq %%mm0, (%3) \n\t"

174 
"add %5, %3 \n\t"

175 
"decl %0 \n\t"

176 
"1: \n\t"

177 
"movq (%1), %%mm0 \n\t"

178 
"add %4, %1 \n\t"

179 
"movq (%1), %%mm1 \n\t"

180 
"add %4, %1 \n\t"

181 
"movq (%2), %%mm2 \n\t"

182 
"movq 8(%2), %%mm3 \n\t"

183 
"pxor %%mm6, %%mm0 \n\t"

184 
"pxor %%mm6, %%mm1 \n\t"

185 
"pxor %%mm6, %%mm2 \n\t"

186 
"pxor %%mm6, %%mm3 \n\t"

187 
PAVGB" %%mm2, %%mm0 \n\t"

188 
PAVGB" %%mm3, %%mm1 \n\t"

189 
"pxor %%mm6, %%mm0 \n\t"

190 
"pxor %%mm6, %%mm1 \n\t"

191 
"movq %%mm0, (%3) \n\t"

192 
"add %5, %3 \n\t"

193 
"movq %%mm1, (%3) \n\t"

194 
"add %5, %3 \n\t"

195 
"movq (%1), %%mm0 \n\t"

196 
"add %4, %1 \n\t"

197 
"movq (%1), %%mm1 \n\t"

198 
"add %4, %1 \n\t"

199 
"movq 16(%2), %%mm2 \n\t"

200 
"movq 24(%2), %%mm3 \n\t"

201 
"pxor %%mm6, %%mm0 \n\t"

202 
"pxor %%mm6, %%mm1 \n\t"

203 
"pxor %%mm6, %%mm2 \n\t"

204 
"pxor %%mm6, %%mm3 \n\t"

205 
PAVGB" %%mm2, %%mm0 \n\t"

206 
PAVGB" %%mm3, %%mm1 \n\t"

207 
"pxor %%mm6, %%mm0 \n\t"

208 
"pxor %%mm6, %%mm1 \n\t"

209 
"movq %%mm0, (%3) \n\t"

210 
"add %5, %3 \n\t"

211 
"movq %%mm1, (%3) \n\t"

212 
"add %5, %3 \n\t"

213 
"add $32, %2 \n\t"

214 
"subl $4, %0 \n\t"

215 
"jnz 1b \n\t"

216 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
217 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
218 
#else

219 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
220 
#endif

221 
:"S"((long)src1Stride), "D"((long)dstStride) 
222 
:"memory");

223 
//the following should be used, though better not with gcc ...

224 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

225 
:"r"(src1Stride), "r"(dstStride)

226 
:"memory");*/

227 
} 
228  
229 
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
230 
{ 
231 
__asm __volatile( 
232 
"testl $1, %0 \n\t"

233 
" jz 1f \n\t"

234 
"movd (%1), %%mm0 \n\t"

235 
"movd (%2), %%mm1 \n\t"

236 
"add %4, %1 \n\t"

237 
"add $4, %2 \n\t"

238 
PAVGB" %%mm1, %%mm0 \n\t"

239 
PAVGB" (%3), %%mm0 \n\t"

240 
"movd %%mm0, (%3) \n\t"

241 
"add %5, %3 \n\t"

242 
"decl %0 \n\t"

243 
"1: \n\t"

244 
"movd (%1), %%mm0 \n\t"

245 
"add %4, %1 \n\t"

246 
"movd (%1), %%mm1 \n\t"

247 
"add %4, %1 \n\t"

248 
PAVGB" (%2), %%mm0 \n\t"

249 
PAVGB" 4(%2), %%mm1 \n\t"

250 
PAVGB" (%3), %%mm0 \n\t"

251 
"movd %%mm0, (%3) \n\t"

252 
"add %5, %3 \n\t"

253 
PAVGB" (%3), %%mm1 \n\t"

254 
"movd %%mm1, (%3) \n\t"

255 
"add %5, %3 \n\t"

256 
"movd (%1), %%mm0 \n\t"

257 
"add %4, %1 \n\t"

258 
"movd (%1), %%mm1 \n\t"

259 
"add %4, %1 \n\t"

260 
PAVGB" 8(%2), %%mm0 \n\t"

261 
PAVGB" 12(%2), %%mm1 \n\t"

262 
PAVGB" (%3), %%mm0 \n\t"

263 
"movd %%mm0, (%3) \n\t"

264 
"add %5, %3 \n\t"

265 
PAVGB" (%3), %%mm1 \n\t"

266 
"movd %%mm1, (%3) \n\t"

267 
"add %5, %3 \n\t"

268 
"add $16, %2 \n\t"

269 
"subl $4, %0 \n\t"

270 
"jnz 1b \n\t"

271 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
272 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
273 
#else

274 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
275 
#endif

276 
:"S"((long)src1Stride), "D"((long)dstStride) 
277 
:"memory");

278 
} 
279  
280  
281 
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
282 
{ 
283 
__asm __volatile( 
284 
"testl $1, %0 \n\t"

285 
" jz 1f \n\t"

286 
"movq (%1), %%mm0 \n\t"

287 
"movq (%2), %%mm1 \n\t"

288 
"add %4, %1 \n\t"

289 
"add $8, %2 \n\t"

290 
PAVGB" %%mm1, %%mm0 \n\t"

291 
PAVGB" (%3), %%mm0 \n\t"

292 
"movq %%mm0, (%3) \n\t"

293 
"add %5, %3 \n\t"

294 
"decl %0 \n\t"

295 
"1: \n\t"

296 
"movq (%1), %%mm0 \n\t"

297 
"add %4, %1 \n\t"

298 
"movq (%1), %%mm1 \n\t"

299 
"add %4, %1 \n\t"

300 
PAVGB" (%2), %%mm0 \n\t"

301 
PAVGB" 8(%2), %%mm1 \n\t"

302 
PAVGB" (%3), %%mm0 \n\t"

303 
"movq %%mm0, (%3) \n\t"

304 
"add %5, %3 \n\t"

305 
PAVGB" (%3), %%mm1 \n\t"

306 
"movq %%mm1, (%3) \n\t"

307 
"add %5, %3 \n\t"

308 
"movq (%1), %%mm0 \n\t"

309 
"add %4, %1 \n\t"

310 
"movq (%1), %%mm1 \n\t"

311 
"add %4, %1 \n\t"

312 
PAVGB" 16(%2), %%mm0 \n\t"

313 
PAVGB" 24(%2), %%mm1 \n\t"

314 
PAVGB" (%3), %%mm0 \n\t"

315 
"movq %%mm0, (%3) \n\t"

316 
"add %5, %3 \n\t"

317 
PAVGB" (%3), %%mm1 \n\t"

318 
"movq %%mm1, (%3) \n\t"

319 
"add %5, %3 \n\t"

320 
"add $32, %2 \n\t"

321 
"subl $4, %0 \n\t"

322 
"jnz 1b \n\t"

323 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
324 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
325 
#else

326 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
327 
#endif

328 
:"S"((long)src1Stride), "D"((long)dstStride) 
329 
:"memory");

330 
//the following should be used, though better not with gcc ...

331 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

332 
:"r"(src1Stride), "r"(dstStride)

333 
:"memory");*/

334 
} 
335  
336 
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
337 
{ 
338 
__asm __volatile( 
339 
"lea (%3, %3), %%"REG_a" \n\t" 
340 
"1: \n\t"

341 
"movq (%1), %%mm0 \n\t"

342 
"movq (%1, %3), %%mm1 \n\t"

343 
"movq 8(%1), %%mm2 \n\t"

344 
"movq 8(%1, %3), %%mm3 \n\t"

345 
PAVGB" 1(%1), %%mm0 \n\t"

346 
PAVGB" 1(%1, %3), %%mm1 \n\t"

347 
PAVGB" 9(%1), %%mm2 \n\t"

348 
PAVGB" 9(%1, %3), %%mm3 \n\t"

349 
"movq %%mm0, (%2) \n\t"

350 
"movq %%mm1, (%2, %3) \n\t"

351 
"movq %%mm2, 8(%2) \n\t"

352 
"movq %%mm3, 8(%2, %3) \n\t"

353 
"add %%"REG_a", %1 \n\t" 
354 
"add %%"REG_a", %2 \n\t" 
355 
"movq (%1), %%mm0 \n\t"

356 
"movq (%1, %3), %%mm1 \n\t"

357 
"movq 8(%1), %%mm2 \n\t"

358 
"movq 8(%1, %3), %%mm3 \n\t"

359 
PAVGB" 1(%1), %%mm0 \n\t"

360 
PAVGB" 1(%1, %3), %%mm1 \n\t"

361 
PAVGB" 9(%1), %%mm2 \n\t"

362 
PAVGB" 9(%1, %3), %%mm3 \n\t"

363 
"add %%"REG_a", %1 \n\t" 
364 
"movq %%mm0, (%2) \n\t"

365 
"movq %%mm1, (%2, %3) \n\t"

366 
"movq %%mm2, 8(%2) \n\t"

367 
"movq %%mm3, 8(%2, %3) \n\t"

368 
"add %%"REG_a", %2 \n\t" 
369 
"subl $4, %0 \n\t"

370 
"jnz 1b \n\t"

371 
:"+g"(h), "+S"(pixels), "+D"(block) 
372 
:"r" ((long)line_size) 
373 
:"%"REG_a, "memory"); 
374 
} 
375  
376 
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
377 
{ 
378 
__asm __volatile( 
379 
"testl $1, %0 \n\t"

380 
" jz 1f \n\t"

381 
"movq (%1), %%mm0 \n\t"

382 
"movq 8(%1), %%mm1 \n\t"

383 
PAVGB" (%2), %%mm0 \n\t"

384 
PAVGB" 8(%2), %%mm1 \n\t"

385 
"add %4, %1 \n\t"

386 
"add $16, %2 \n\t"

387 
"movq %%mm0, (%3) \n\t"

388 
"movq %%mm1, 8(%3) \n\t"

389 
"add %5, %3 \n\t"

390 
"decl %0 \n\t"

391 
"1: \n\t"

392 
"movq (%1), %%mm0 \n\t"

393 
"movq 8(%1), %%mm1 \n\t"

394 
"add %4, %1 \n\t"

395 
PAVGB" (%2), %%mm0 \n\t"

396 
PAVGB" 8(%2), %%mm1 \n\t"

397 
"movq %%mm0, (%3) \n\t"

398 
"movq %%mm1, 8(%3) \n\t"

399 
"add %5, %3 \n\t"

400 
"movq (%1), %%mm0 \n\t"

401 
"movq 8(%1), %%mm1 \n\t"

402 
"add %4, %1 \n\t"

403 
PAVGB" 16(%2), %%mm0 \n\t"

404 
PAVGB" 24(%2), %%mm1 \n\t"

405 
"movq %%mm0, (%3) \n\t"

406 
"movq %%mm1, 8(%3) \n\t"

407 
"add %5, %3 \n\t"

408 
"add $32, %2 \n\t"

409 
"subl $2, %0 \n\t"

410 
"jnz 1b \n\t"

411 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
412 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
413 
#else

414 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
415 
#endif

416 
:"S"((long)src1Stride), "D"((long)dstStride) 
417 
:"memory");

418 
//the following should be used, though better not with gcc ...

419 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

420 
:"r"(src1Stride), "r"(dstStride)

421 
:"memory");*/

422 
} 
423  
424 
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
425 
{ 
426 
__asm __volatile( 
427 
"testl $1, %0 \n\t"

428 
" jz 1f \n\t"

429 
"movq (%1), %%mm0 \n\t"

430 
"movq 8(%1), %%mm1 \n\t"

431 
PAVGB" (%2), %%mm0 \n\t"

432 
PAVGB" 8(%2), %%mm1 \n\t"

433 
"add %4, %1 \n\t"

434 
"add $16, %2 \n\t"

435 
PAVGB" (%3), %%mm0 \n\t"

436 
PAVGB" 8(%3), %%mm1 \n\t"

437 
"movq %%mm0, (%3) \n\t"

438 
"movq %%mm1, 8(%3) \n\t"

439 
"add %5, %3 \n\t"

440 
"decl %0 \n\t"

441 
"1: \n\t"

442 
"movq (%1), %%mm0 \n\t"

443 
"movq 8(%1), %%mm1 \n\t"

444 
"add %4, %1 \n\t"

445 
PAVGB" (%2), %%mm0 \n\t"

446 
PAVGB" 8(%2), %%mm1 \n\t"

447 
PAVGB" (%3), %%mm0 \n\t"

448 
PAVGB" 8(%3), %%mm1 \n\t"

449 
"movq %%mm0, (%3) \n\t"

450 
"movq %%mm1, 8(%3) \n\t"

451 
"add %5, %3 \n\t"

452 
"movq (%1), %%mm0 \n\t"

453 
"movq 8(%1), %%mm1 \n\t"

454 
"add %4, %1 \n\t"

455 
PAVGB" 16(%2), %%mm0 \n\t"

456 
PAVGB" 24(%2), %%mm1 \n\t"

457 
PAVGB" (%3), %%mm0 \n\t"

458 
PAVGB" 8(%3), %%mm1 \n\t"

459 
"movq %%mm0, (%3) \n\t"

460 
"movq %%mm1, 8(%3) \n\t"

461 
"add %5, %3 \n\t"

462 
"add $32, %2 \n\t"

463 
"subl $2, %0 \n\t"

464 
"jnz 1b \n\t"

465 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
466 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
467 
#else

468 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
469 
#endif

470 
:"S"((long)src1Stride), "D"((long)dstStride) 
471 
:"memory");

472 
//the following should be used, though better not with gcc ...

473 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

474 
:"r"(src1Stride), "r"(dstStride)

475 
:"memory");*/

476 
} 
477  
478 
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
479 
{ 
480 
__asm __volatile( 
481 
"pcmpeqb %%mm6, %%mm6 \n\t"

482 
"testl $1, %0 \n\t"

483 
" jz 1f \n\t"

484 
"movq (%1), %%mm0 \n\t"

485 
"movq 8(%1), %%mm1 \n\t"

486 
"movq (%2), %%mm2 \n\t"

487 
"movq 8(%2), %%mm3 \n\t"

488 
"pxor %%mm6, %%mm0 \n\t"

489 
"pxor %%mm6, %%mm1 \n\t"

490 
"pxor %%mm6, %%mm2 \n\t"

491 
"pxor %%mm6, %%mm3 \n\t"

492 
PAVGB" %%mm2, %%mm0 \n\t"

493 
PAVGB" %%mm3, %%mm1 \n\t"

494 
"pxor %%mm6, %%mm0 \n\t"

495 
"pxor %%mm6, %%mm1 \n\t"

496 
"add %4, %1 \n\t"

497 
"add $16, %2 \n\t"

498 
"movq %%mm0, (%3) \n\t"

499 
"movq %%mm1, 8(%3) \n\t"

500 
"add %5, %3 \n\t"

501 
"decl %0 \n\t"

502 
"1: \n\t"

503 
"movq (%1), %%mm0 \n\t"

504 
"movq 8(%1), %%mm1 \n\t"

505 
"add %4, %1 \n\t"

506 
"movq (%2), %%mm2 \n\t"

507 
"movq 8(%2), %%mm3 \n\t"

508 
"pxor %%mm6, %%mm0 \n\t"

509 
"pxor %%mm6, %%mm1 \n\t"

510 
"pxor %%mm6, %%mm2 \n\t"

511 
"pxor %%mm6, %%mm3 \n\t"

512 
PAVGB" %%mm2, %%mm0 \n\t"

513 
PAVGB" %%mm3, %%mm1 \n\t"

514 
"pxor %%mm6, %%mm0 \n\t"

515 
"pxor %%mm6, %%mm1 \n\t"

516 
"movq %%mm0, (%3) \n\t"

517 
"movq %%mm1, 8(%3) \n\t"

518 
"add %5, %3 \n\t"

519 
"movq (%1), %%mm0 \n\t"

520 
"movq 8(%1), %%mm1 \n\t"

521 
"add %4, %1 \n\t"

522 
"movq 16(%2), %%mm2 \n\t"

523 
"movq 24(%2), %%mm3 \n\t"

524 
"pxor %%mm6, %%mm0 \n\t"

525 
"pxor %%mm6, %%mm1 \n\t"

526 
"pxor %%mm6, %%mm2 \n\t"

527 
"pxor %%mm6, %%mm3 \n\t"

528 
PAVGB" %%mm2, %%mm0 \n\t"

529 
PAVGB" %%mm3, %%mm1 \n\t"

530 
"pxor %%mm6, %%mm0 \n\t"

531 
"pxor %%mm6, %%mm1 \n\t"

532 
"movq %%mm0, (%3) \n\t"

533 
"movq %%mm1, 8(%3) \n\t"

534 
"add %5, %3 \n\t"

535 
"add $32, %2 \n\t"

536 
"subl $2, %0 \n\t"

537 
"jnz 1b \n\t"

538 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used 
539 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
540 
#else

541 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
542 
#endif

543 
:"S"((long)src1Stride), "D"((long)dstStride) 
544 
:"memory");

545 
//the following should be used, though better not with gcc ...

546 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

547 
:"r"(src1Stride), "r"(dstStride)

548 
:"memory");*/

549 
} 
550  
551 
/* GL: this function does incorrect rounding if overflow */

552 
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
553 
{ 
554 
MOVQ_BONE(mm6); 
555 
__asm __volatile( 
556 
"lea (%3, %3), %%"REG_a" \n\t" 
557 
"1: \n\t"

558 
"movq (%1), %%mm0 \n\t"

559 
"movq (%1, %3), %%mm2 \n\t"

560 
"movq 1(%1), %%mm1 \n\t"

561 
"movq 1(%1, %3), %%mm3 \n\t"

562 
"add %%"REG_a", %1 \n\t" 
563 
"psubusb %%mm6, %%mm0 \n\t"

564 
"psubusb %%mm6, %%mm2 \n\t"

565 
PAVGB" %%mm1, %%mm0 \n\t"

566 
PAVGB" %%mm3, %%mm2 \n\t"

567 
"movq %%mm0, (%2) \n\t"

568 
"movq %%mm2, (%2, %3) \n\t"

569 
"movq (%1), %%mm0 \n\t"

570 
"movq 1(%1), %%mm1 \n\t"

571 
"movq (%1, %3), %%mm2 \n\t"

572 
"movq 1(%1, %3), %%mm3 \n\t"

573 
"add %%"REG_a", %2 \n\t" 
574 
"add %%"REG_a", %1 \n\t" 
575 
"psubusb %%mm6, %%mm0 \n\t"

576 
"psubusb %%mm6, %%mm2 \n\t"

577 
PAVGB" %%mm1, %%mm0 \n\t"

578 
PAVGB" %%mm3, %%mm2 \n\t"

579 
"movq %%mm0, (%2) \n\t"

580 
"movq %%mm2, (%2, %3) \n\t"

581 
"add %%"REG_a", %2 \n\t" 
582 
"subl $4, %0 \n\t"

583 
"jnz 1b \n\t"

584 
:"+g"(h), "+S"(pixels), "+D"(block) 
585 
:"r" ((long)line_size) 
586 
:"%"REG_a, "memory"); 
587 
} 
588  
589 
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
590 
{ 
591 
__asm __volatile( 
592 
"lea (%3, %3), %%"REG_a" \n\t" 
593 
"movq (%1), %%mm0 \n\t"

594 
"sub %3, %2 \n\t"

595 
"1: \n\t"

596 
"movq (%1, %3), %%mm1 \n\t"

597 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
598 
"add %%"REG_a", %1 \n\t" 
599 
PAVGB" %%mm1, %%mm0 \n\t"

600 
PAVGB" %%mm2, %%mm1 \n\t"

601 
"movq %%mm0, (%2, %3) \n\t"

602 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
603 
"movq (%1, %3), %%mm1 \n\t"

604 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
605 
"add %%"REG_a", %2 \n\t" 
606 
"add %%"REG_a", %1 \n\t" 
607 
PAVGB" %%mm1, %%mm2 \n\t"

608 
PAVGB" %%mm0, %%mm1 \n\t"

609 
"movq %%mm2, (%2, %3) \n\t"

610 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
611 
"add %%"REG_a", %2 \n\t" 
612 
"subl $4, %0 \n\t"

613 
"jnz 1b \n\t"

614 
:"+g"(h), "+S"(pixels), "+D" (block) 
615 
:"r" ((long)line_size) 
616 
:"%"REG_a, "memory"); 
617 
} 
618  
619 
/* GL: this function does incorrect rounding if overflow */

620 
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
621 
{ 
622 
MOVQ_BONE(mm6); 
623 
__asm __volatile( 
624 
"lea (%3, %3), %%"REG_a" \n\t" 
625 
"movq (%1), %%mm0 \n\t"

626 
"sub %3, %2 \n\t"

627 
"1: \n\t"

628 
"movq (%1, %3), %%mm1 \n\t"

629 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
630 
"add %%"REG_a", %1 \n\t" 
631 
"psubusb %%mm6, %%mm1 \n\t"

632 
PAVGB" %%mm1, %%mm0 \n\t"

633 
PAVGB" %%mm2, %%mm1 \n\t"

634 
"movq %%mm0, (%2, %3) \n\t"

635 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
636 
"movq (%1, %3), %%mm1 \n\t"

637 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
638 
"add %%"REG_a", %2 \n\t" 
639 
"add %%"REG_a", %1 \n\t" 
640 
"psubusb %%mm6, %%mm1 \n\t"

641 
PAVGB" %%mm1, %%mm2 \n\t"

642 
PAVGB" %%mm0, %%mm1 \n\t"

643 
"movq %%mm2, (%2, %3) \n\t"

644 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
645 
"add %%"REG_a", %2 \n\t" 
646 
"subl $4, %0 \n\t"

647 
"jnz 1b \n\t"

648 
:"+g"(h), "+S"(pixels), "+D" (block) 
649 
:"r" ((long)line_size) 
650 
:"%"REG_a, "memory"); 
651 
} 
652  
653 
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
654 
{ 
655 
__asm __volatile( 
656 
"lea (%3, %3), %%"REG_a" \n\t" 
657 
"1: \n\t"

658 
"movq (%2), %%mm0 \n\t"

659 
"movq (%2, %3), %%mm1 \n\t"

660 
PAVGB" (%1), %%mm0 \n\t"

661 
PAVGB" (%1, %3), %%mm1 \n\t"

662 
"movq %%mm0, (%2) \n\t"

663 
"movq %%mm1, (%2, %3) \n\t"

664 
"add %%"REG_a", %1 \n\t" 
665 
"add %%"REG_a", %2 \n\t" 
666 
"movq (%2), %%mm0 \n\t"

667 
"movq (%2, %3), %%mm1 \n\t"

668 
PAVGB" (%1), %%mm0 \n\t"

669 
PAVGB" (%1, %3), %%mm1 \n\t"

670 
"add %%"REG_a", %1 \n\t" 
671 
"movq %%mm0, (%2) \n\t"

672 
"movq %%mm1, (%2, %3) \n\t"

673 
"add %%"REG_a", %2 \n\t" 
674 
"subl $4, %0 \n\t"

675 
"jnz 1b \n\t"

676 
:"+g"(h), "+S"(pixels), "+D"(block) 
677 
:"r" ((long)line_size) 
678 
:"%"REG_a, "memory"); 
679 
} 
680  
681 
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
682 
{ 
683 
__asm __volatile( 
684 
"lea (%3, %3), %%"REG_a" \n\t" 
685 
"1: \n\t"

686 
"movq (%1), %%mm0 \n\t"

687 
"movq (%1, %3), %%mm2 \n\t"

688 
PAVGB" 1(%1), %%mm0 \n\t"

689 
PAVGB" 1(%1, %3), %%mm2 \n\t"

690 
PAVGB" (%2), %%mm0 \n\t"

691 
PAVGB" (%2, %3), %%mm2 \n\t"

692 
"add %%"REG_a", %1 \n\t" 
693 
"movq %%mm0, (%2) \n\t"

694 
"movq %%mm2, (%2, %3) \n\t"

695 
"movq (%1), %%mm0 \n\t"

696 
"movq (%1, %3), %%mm2 \n\t"

697 
PAVGB" 1(%1), %%mm0 \n\t"

698 
PAVGB" 1(%1, %3), %%mm2 \n\t"

699 
"add %%"REG_a", %2 \n\t" 
700 
"add %%"REG_a", %1 \n\t" 
701 
PAVGB" (%2), %%mm0 \n\t"

702 
PAVGB" (%2, %3), %%mm2 \n\t"

703 
"movq %%mm0, (%2) \n\t"

704 
"movq %%mm2, (%2, %3) \n\t"

705 
"add %%"REG_a", %2 \n\t" 
706 
"subl $4, %0 \n\t"

707 
"jnz 1b \n\t"

708 
:"+g"(h), "+S"(pixels), "+D"(block) 
709 
:"r" ((long)line_size) 
710 
:"%"REG_a, "memory"); 
711 
} 
712  
713 
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
714 
{ 
715 
__asm __volatile( 
716 
"lea (%3, %3), %%"REG_a" \n\t" 
717 
"movq (%1), %%mm0 \n\t"

718 
"sub %3, %2 \n\t"

719 
"1: \n\t"

720 
"movq (%1, %3), %%mm1 \n\t"

721 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
722 
"add %%"REG_a", %1 \n\t" 
723 
PAVGB" %%mm1, %%mm0 \n\t"

724 
PAVGB" %%mm2, %%mm1 \n\t"

725 
"movq (%2, %3), %%mm3 \n\t"

726 
"movq (%2, %%"REG_a"), %%mm4 \n\t" 
727 
PAVGB" %%mm3, %%mm0 \n\t"

728 
PAVGB" %%mm4, %%mm1 \n\t"

729 
"movq %%mm0, (%2, %3) \n\t"

730 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
731 
"movq (%1, %3), %%mm1 \n\t"

732 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
733 
PAVGB" %%mm1, %%mm2 \n\t"

734 
PAVGB" %%mm0, %%mm1 \n\t"

735 
"add %%"REG_a", %2 \n\t" 
736 
"add %%"REG_a", %1 \n\t" 
737 
"movq (%2, %3), %%mm3 \n\t"

738 
"movq (%2, %%"REG_a"), %%mm4 \n\t" 
739 
PAVGB" %%mm3, %%mm2 \n\t"

740 
PAVGB" %%mm4, %%mm1 \n\t"

741 
"movq %%mm2, (%2, %3) \n\t"

742 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
743 
"add %%"REG_a", %2 \n\t" 
744 
"subl $4, %0 \n\t"

745 
"jnz 1b \n\t"

746 
:"+g"(h), "+S"(pixels), "+D"(block) 
747 
:"r" ((long)line_size) 
748 
:"%"REG_a, "memory"); 
749 
} 
750  
751 
/* Note this is not correctly rounded, but this function is only

752 
* used for Bframes so it does not matter. */

753 
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
754 
{ 
755 
MOVQ_BONE(mm6); 
756 
__asm __volatile( 
757 
"lea (%3, %3), %%"REG_a" \n\t" 
758 
"movq (%1), %%mm0 \n\t"

759 
PAVGB" 1(%1), %%mm0 \n\t"

760 
ASMALIGN(3)

761 
"1: \n\t"

762 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
763 
"movq (%1, %3), %%mm1 \n\t"

764 
"psubusb %%mm6, %%mm2 \n\t"

765 
PAVGB" 1(%1, %3), %%mm1 \n\t"

766 
PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" 
767 
"add %%"REG_a", %1 \n\t" 
768 
PAVGB" %%mm1, %%mm0 \n\t"

769 
PAVGB" %%mm2, %%mm1 \n\t"

770 
PAVGB" (%2), %%mm0 \n\t"

771 
PAVGB" (%2, %3), %%mm1 \n\t"

772 
"movq %%mm0, (%2) \n\t"

773 
"movq %%mm1, (%2, %3) \n\t"

774 
"movq (%1, %3), %%mm1 \n\t"

775 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
776 
PAVGB" 1(%1, %3), %%mm1 \n\t"

777 
PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" 
778 
"add %%"REG_a", %2 \n\t" 
779 
"add %%"REG_a", %1 \n\t" 
780 
PAVGB" %%mm1, %%mm2 \n\t"

781 
PAVGB" %%mm0, %%mm1 \n\t"

782 
PAVGB" (%2), %%mm2 \n\t"

783 
PAVGB" (%2, %3), %%mm1 \n\t"

784 
"movq %%mm2, (%2) \n\t"

785 
"movq %%mm1, (%2, %3) \n\t"

786 
"add %%"REG_a", %2 \n\t" 
787 
"subl $4, %0 \n\t"

788 
"jnz 1b \n\t"

789 
:"+g"(h), "+S"(pixels), "+D"(block) 
790 
:"r" ((long)line_size) 
791 
:"%"REG_a, "memory"); 
792 
} 
793  
794 
//FIXME the following could be optimized too ...

795 
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
796 
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); 
797 
DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); 
798 
} 
799 
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
800 
DEF(put_pixels8_y2)(block , pixels , line_size, h); 
801 
DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); 
802 
} 
803 
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
804 
DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); 
805 
DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); 
806 
} 
807 
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
808 
DEF(avg_pixels8)(block , pixels , line_size, h); 
809 
DEF(avg_pixels8)(block+8, pixels+8, line_size, h); 
810 
} 
811 
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
812 
DEF(avg_pixels8_x2)(block , pixels , line_size, h); 
813 
DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); 
814 
} 
815 
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
816 
DEF(avg_pixels8_y2)(block , pixels , line_size, h); 
817 
DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); 
818 
} 
819 
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
820 
DEF(avg_pixels8_xy2)(block , pixels , line_size, h); 
821 
DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); 
822 
} 
823  
824 
#define QPEL_2TAP_L3(OPNAME) \

825 
static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ 
826 
asm volatile(\ 
827 
"1: \n\t"\

828 
"movq (%1,%2), %%mm0 \n\t"\

829 
"movq 8(%1,%2), %%mm1 \n\t"\

830 
PAVGB" (%1,%3), %%mm0 \n\t"\

831 
PAVGB" 8(%1,%3), %%mm1 \n\t"\

832 
PAVGB" (%1), %%mm0 \n\t"\

833 
PAVGB" 8(%1), %%mm1 \n\t"\

834 
STORE_OP( (%1,%4),%%mm0)\ 
835 
STORE_OP(8(%1,%4),%%mm1)\ 
836 
"movq %%mm0, (%1,%4) \n\t"\

837 
"movq %%mm1, 8(%1,%4) \n\t"\

838 
"add %5, %1 \n\t"\

839 
"decl %0 \n\t"\

840 
"jnz 1b \n\t"\

841 
:"+g"(h), "+r"(src)\ 
842 
:"r"((long)off1), "r"((long)off2),\ 
843 
"r"((long)(dstsrc)), "r"((long)stride)\ 
844 
:"memory"\

845 
);\ 
846 
}\ 
847 
static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ 
848 
asm volatile(\ 
849 
"1: \n\t"\

850 
"movq (%1,%2), %%mm0 \n\t"\

851 
PAVGB" (%1,%3), %%mm0 \n\t"\

852 
PAVGB" (%1), %%mm0 \n\t"\

853 
STORE_OP((%1,%4),%%mm0)\ 
854 
"movq %%mm0, (%1,%4) \n\t"\

855 
"add %5, %1 \n\t"\

856 
"decl %0 \n\t"\

857 
"jnz 1b \n\t"\

858 
:"+g"(h), "+r"(src)\ 
859 
:"r"((long)off1), "r"((long)off2),\ 
860 
"r"((long)(dstsrc)), "r"((long)stride)\ 
861 
:"memory"\

862 
);\ 
863 
} 
864  
865 
#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" 
866 
QPEL_2TAP_L3(avg_) 
867 
#undef STORE_OP

868 
#define STORE_OP(a,b)

869 
QPEL_2TAP_L3(put_) 
870 
#undef STORE_OP

871 
#undef QPEL_2TAP_L3
