ffmpeg / libavcodec / i386 / dsputil_mmx_avg.h @ 5509bffa
History  View  Annotate  Download (34.7 KB)
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 */

24  
/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
   clobber bug - now it will work with 2.95.2 and also with -fPIC
 */

28 
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
29 
{ 
30 
__asm __volatile( 
31 
"lea (%3, %3), %%"REG_a" \n\t" 
32 
"1: \n\t"

33 
"movq (%1), %%mm0 \n\t"

34 
"movq (%1, %3), %%mm1 \n\t"

35 
PAVGB" 1(%1), %%mm0 \n\t"

36 
PAVGB" 1(%1, %3), %%mm1 \n\t"

37 
"movq %%mm0, (%2) \n\t"

38 
"movq %%mm1, (%2, %3) \n\t"

39 
"add %%"REG_a", %1 \n\t" 
40 
"add %%"REG_a", %2 \n\t" 
41 
"movq (%1), %%mm0 \n\t"

42 
"movq (%1, %3), %%mm1 \n\t"

43 
PAVGB" 1(%1), %%mm0 \n\t"

44 
PAVGB" 1(%1, %3), %%mm1 \n\t"

45 
"add %%"REG_a", %1 \n\t" 
46 
"movq %%mm0, (%2) \n\t"

47 
"movq %%mm1, (%2, %3) \n\t"

48 
"add %%"REG_a", %2 \n\t" 
49 
"subl $4, %0 \n\t"

50 
"jnz 1b \n\t"

51 
:"+g"(h), "+S"(pixels), "+D"(block) 
52 
:"r" ((long)line_size) 
53 
:"%"REG_a, "memory"); 
54 
} 
55  
56 
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
57 
{ 
58 
__asm __volatile( 
59 
"testl $1, %0 \n\t"

60 
" jz 1f \n\t"

61 
"movd (%1), %%mm0 \n\t"

62 
"movd (%2), %%mm1 \n\t"

63 
"add %4, %1 \n\t"

64 
"add $4, %2 \n\t"

65 
PAVGB" %%mm1, %%mm0 \n\t"

66 
"movd %%mm0, (%3) \n\t"

67 
"add %5, %3 \n\t"

68 
"decl %0 \n\t"

69 
"1: \n\t"

70 
"movd (%1), %%mm0 \n\t"

71 
"add %4, %1 \n\t"

72 
"movd (%1), %%mm1 \n\t"

73 
"movd (%2), %%mm2 \n\t"

74 
"movd 4(%2), %%mm3 \n\t"

75 
"add %4, %1 \n\t"

76 
PAVGB" %%mm2, %%mm0 \n\t"

77 
PAVGB" %%mm3, %%mm1 \n\t"

78 
"movd %%mm0, (%3) \n\t"

79 
"add %5, %3 \n\t"

80 
"movd %%mm1, (%3) \n\t"

81 
"add %5, %3 \n\t"

82 
"movd (%1), %%mm0 \n\t"

83 
"add %4, %1 \n\t"

84 
"movd (%1), %%mm1 \n\t"

85 
"movd 8(%2), %%mm2 \n\t"

86 
"movd 12(%2), %%mm3 \n\t"

87 
"add %4, %1 \n\t"

88 
PAVGB" %%mm2, %%mm0 \n\t"

89 
PAVGB" %%mm3, %%mm1 \n\t"

90 
"movd %%mm0, (%3) \n\t"

91 
"add %5, %3 \n\t"

92 
"movd %%mm1, (%3) \n\t"

93 
"add %5, %3 \n\t"

94 
"add $16, %2 \n\t"

95 
"subl $4, %0 \n\t"

96 
"jnz 1b \n\t"

97 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
98 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
99 
#else

100 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
101 
#endif

102 
:"S"((long)src1Stride), "D"((long)dstStride) 
103 
:"memory");

104 
} 
105  
106  
107 
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
108 
{ 
109 
__asm __volatile( 
110 
"testl $1, %0 \n\t"

111 
" jz 1f \n\t"

112 
"movq (%1), %%mm0 \n\t"

113 
"movq (%2), %%mm1 \n\t"

114 
"add %4, %1 \n\t"

115 
"add $8, %2 \n\t"

116 
PAVGB" %%mm1, %%mm0 \n\t"

117 
"movq %%mm0, (%3) \n\t"

118 
"add %5, %3 \n\t"

119 
"decl %0 \n\t"

120 
"1: \n\t"

121 
"movq (%1), %%mm0 \n\t"

122 
"add %4, %1 \n\t"

123 
"movq (%1), %%mm1 \n\t"

124 
"add %4, %1 \n\t"

125 
PAVGB" (%2), %%mm0 \n\t"

126 
PAVGB" 8(%2), %%mm1 \n\t"

127 
"movq %%mm0, (%3) \n\t"

128 
"add %5, %3 \n\t"

129 
"movq %%mm1, (%3) \n\t"

130 
"add %5, %3 \n\t"

131 
"movq (%1), %%mm0 \n\t"

132 
"add %4, %1 \n\t"

133 
"movq (%1), %%mm1 \n\t"

134 
"add %4, %1 \n\t"

135 
PAVGB" 16(%2), %%mm0 \n\t"

136 
PAVGB" 24(%2), %%mm1 \n\t"

137 
"movq %%mm0, (%3) \n\t"

138 
"add %5, %3 \n\t"

139 
"movq %%mm1, (%3) \n\t"

140 
"add %5, %3 \n\t"

141 
"add $32, %2 \n\t"

142 
"subl $4, %0 \n\t"

143 
"jnz 1b \n\t"

144 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
145 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
146 
#else

147 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
148 
#endif

149 
:"S"((long)src1Stride), "D"((long)dstStride) 
150 
:"memory");

151 
//the following should be used, though better not with gcc ...

152 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

153 
:"r"(src1Stride), "r"(dstStride)

154 
:"memory");*/

155 
} 
156  
157 
static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
158 
{ 
159 
__asm __volatile( 
160 
"pcmpeqb %%mm6, %%mm6 \n\t"

161 
"testl $1, %0 \n\t"

162 
" jz 1f \n\t"

163 
"movq (%1), %%mm0 \n\t"

164 
"movq (%2), %%mm1 \n\t"

165 
"add %4, %1 \n\t"

166 
"add $8, %2 \n\t"

167 
"pxor %%mm6, %%mm0 \n\t"

168 
"pxor %%mm6, %%mm1 \n\t"

169 
PAVGB" %%mm1, %%mm0 \n\t"

170 
"pxor %%mm6, %%mm0 \n\t"

171 
"movq %%mm0, (%3) \n\t"

172 
"add %5, %3 \n\t"

173 
"decl %0 \n\t"

174 
"1: \n\t"

175 
"movq (%1), %%mm0 \n\t"

176 
"add %4, %1 \n\t"

177 
"movq (%1), %%mm1 \n\t"

178 
"add %4, %1 \n\t"

179 
"movq (%2), %%mm2 \n\t"

180 
"movq 8(%2), %%mm3 \n\t"

181 
"pxor %%mm6, %%mm0 \n\t"

182 
"pxor %%mm6, %%mm1 \n\t"

183 
"pxor %%mm6, %%mm2 \n\t"

184 
"pxor %%mm6, %%mm3 \n\t"

185 
PAVGB" %%mm2, %%mm0 \n\t"

186 
PAVGB" %%mm3, %%mm1 \n\t"

187 
"pxor %%mm6, %%mm0 \n\t"

188 
"pxor %%mm6, %%mm1 \n\t"

189 
"movq %%mm0, (%3) \n\t"

190 
"add %5, %3 \n\t"

191 
"movq %%mm1, (%3) \n\t"

192 
"add %5, %3 \n\t"

193 
"movq (%1), %%mm0 \n\t"

194 
"add %4, %1 \n\t"

195 
"movq (%1), %%mm1 \n\t"

196 
"add %4, %1 \n\t"

197 
"movq 16(%2), %%mm2 \n\t"

198 
"movq 24(%2), %%mm3 \n\t"

199 
"pxor %%mm6, %%mm0 \n\t"

200 
"pxor %%mm6, %%mm1 \n\t"

201 
"pxor %%mm6, %%mm2 \n\t"

202 
"pxor %%mm6, %%mm3 \n\t"

203 
PAVGB" %%mm2, %%mm0 \n\t"

204 
PAVGB" %%mm3, %%mm1 \n\t"

205 
"pxor %%mm6, %%mm0 \n\t"

206 
"pxor %%mm6, %%mm1 \n\t"

207 
"movq %%mm0, (%3) \n\t"

208 
"add %5, %3 \n\t"

209 
"movq %%mm1, (%3) \n\t"

210 
"add %5, %3 \n\t"

211 
"add $32, %2 \n\t"

212 
"subl $4, %0 \n\t"

213 
"jnz 1b \n\t"

214 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
215 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
216 
#else

217 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
218 
#endif

219 
:"S"((long)src1Stride), "D"((long)dstStride) 
220 
:"memory");

221 
//the following should be used, though better not with gcc ...

222 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

223 
:"r"(src1Stride), "r"(dstStride)

224 
:"memory");*/

225 
} 
226  
227 
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
228 
{ 
229 
__asm __volatile( 
230 
"testl $1, %0 \n\t"

231 
" jz 1f \n\t"

232 
"movd (%1), %%mm0 \n\t"

233 
"movd (%2), %%mm1 \n\t"

234 
"add %4, %1 \n\t"

235 
"add $4, %2 \n\t"

236 
PAVGB" %%mm1, %%mm0 \n\t"

237 
PAVGB" (%3), %%mm0 \n\t"

238 
"movd %%mm0, (%3) \n\t"

239 
"add %5, %3 \n\t"

240 
"decl %0 \n\t"

241 
"1: \n\t"

242 
"movd (%1), %%mm0 \n\t"

243 
"add %4, %1 \n\t"

244 
"movd (%1), %%mm1 \n\t"

245 
"add %4, %1 \n\t"

246 
PAVGB" (%2), %%mm0 \n\t"

247 
PAVGB" 4(%2), %%mm1 \n\t"

248 
PAVGB" (%3), %%mm0 \n\t"

249 
"movd %%mm0, (%3) \n\t"

250 
"add %5, %3 \n\t"

251 
PAVGB" (%3), %%mm1 \n\t"

252 
"movd %%mm1, (%3) \n\t"

253 
"add %5, %3 \n\t"

254 
"movd (%1), %%mm0 \n\t"

255 
"add %4, %1 \n\t"

256 
"movd (%1), %%mm1 \n\t"

257 
"add %4, %1 \n\t"

258 
PAVGB" 8(%2), %%mm0 \n\t"

259 
PAVGB" 12(%2), %%mm1 \n\t"

260 
PAVGB" (%3), %%mm0 \n\t"

261 
"movd %%mm0, (%3) \n\t"

262 
"add %5, %3 \n\t"

263 
PAVGB" (%3), %%mm1 \n\t"

264 
"movd %%mm1, (%3) \n\t"

265 
"add %5, %3 \n\t"

266 
"add $16, %2 \n\t"

267 
"subl $4, %0 \n\t"

268 
"jnz 1b \n\t"

269 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
270 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
271 
#else

272 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
273 
#endif

274 
:"S"((long)src1Stride), "D"((long)dstStride) 
275 
:"memory");

276 
} 
277  
278  
279 
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
280 
{ 
281 
__asm __volatile( 
282 
"testl $1, %0 \n\t"

283 
" jz 1f \n\t"

284 
"movq (%1), %%mm0 \n\t"

285 
"movq (%2), %%mm1 \n\t"

286 
"add %4, %1 \n\t"

287 
"add $8, %2 \n\t"

288 
PAVGB" %%mm1, %%mm0 \n\t"

289 
PAVGB" (%3), %%mm0 \n\t"

290 
"movq %%mm0, (%3) \n\t"

291 
"add %5, %3 \n\t"

292 
"decl %0 \n\t"

293 
"1: \n\t"

294 
"movq (%1), %%mm0 \n\t"

295 
"add %4, %1 \n\t"

296 
"movq (%1), %%mm1 \n\t"

297 
"add %4, %1 \n\t"

298 
PAVGB" (%2), %%mm0 \n\t"

299 
PAVGB" 8(%2), %%mm1 \n\t"

300 
PAVGB" (%3), %%mm0 \n\t"

301 
"movq %%mm0, (%3) \n\t"

302 
"add %5, %3 \n\t"

303 
PAVGB" (%3), %%mm1 \n\t"

304 
"movq %%mm1, (%3) \n\t"

305 
"add %5, %3 \n\t"

306 
"movq (%1), %%mm0 \n\t"

307 
"add %4, %1 \n\t"

308 
"movq (%1), %%mm1 \n\t"

309 
"add %4, %1 \n\t"

310 
PAVGB" 16(%2), %%mm0 \n\t"

311 
PAVGB" 24(%2), %%mm1 \n\t"

312 
PAVGB" (%3), %%mm0 \n\t"

313 
"movq %%mm0, (%3) \n\t"

314 
"add %5, %3 \n\t"

315 
PAVGB" (%3), %%mm1 \n\t"

316 
"movq %%mm1, (%3) \n\t"

317 
"add %5, %3 \n\t"

318 
"add $32, %2 \n\t"

319 
"subl $4, %0 \n\t"

320 
"jnz 1b \n\t"

321 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
322 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
323 
#else

324 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
325 
#endif

326 
:"S"((long)src1Stride), "D"((long)dstStride) 
327 
:"memory");

328 
//the following should be used, though better not with gcc ...

329 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

330 
:"r"(src1Stride), "r"(dstStride)

331 
:"memory");*/

332 
} 
333  
334 
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
335 
{ 
336 
__asm __volatile( 
337 
"lea (%3, %3), %%"REG_a" \n\t" 
338 
"1: \n\t"

339 
"movq (%1), %%mm0 \n\t"

340 
"movq (%1, %3), %%mm1 \n\t"

341 
"movq 8(%1), %%mm2 \n\t"

342 
"movq 8(%1, %3), %%mm3 \n\t"

343 
PAVGB" 1(%1), %%mm0 \n\t"

344 
PAVGB" 1(%1, %3), %%mm1 \n\t"

345 
PAVGB" 9(%1), %%mm2 \n\t"

346 
PAVGB" 9(%1, %3), %%mm3 \n\t"

347 
"movq %%mm0, (%2) \n\t"

348 
"movq %%mm1, (%2, %3) \n\t"

349 
"movq %%mm2, 8(%2) \n\t"

350 
"movq %%mm3, 8(%2, %3) \n\t"

351 
"add %%"REG_a", %1 \n\t" 
352 
"add %%"REG_a", %2 \n\t" 
353 
"movq (%1), %%mm0 \n\t"

354 
"movq (%1, %3), %%mm1 \n\t"

355 
"movq 8(%1), %%mm2 \n\t"

356 
"movq 8(%1, %3), %%mm3 \n\t"

357 
PAVGB" 1(%1), %%mm0 \n\t"

358 
PAVGB" 1(%1, %3), %%mm1 \n\t"

359 
PAVGB" 9(%1), %%mm2 \n\t"

360 
PAVGB" 9(%1, %3), %%mm3 \n\t"

361 
"add %%"REG_a", %1 \n\t" 
362 
"movq %%mm0, (%2) \n\t"

363 
"movq %%mm1, (%2, %3) \n\t"

364 
"movq %%mm2, 8(%2) \n\t"

365 
"movq %%mm3, 8(%2, %3) \n\t"

366 
"add %%"REG_a", %2 \n\t" 
367 
"subl $4, %0 \n\t"

368 
"jnz 1b \n\t"

369 
:"+g"(h), "+S"(pixels), "+D"(block) 
370 
:"r" ((long)line_size) 
371 
:"%"REG_a, "memory"); 
372 
} 
373  
374 
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
375 
{ 
376 
__asm __volatile( 
377 
"testl $1, %0 \n\t"

378 
" jz 1f \n\t"

379 
"movq (%1), %%mm0 \n\t"

380 
"movq 8(%1), %%mm1 \n\t"

381 
PAVGB" (%2), %%mm0 \n\t"

382 
PAVGB" 8(%2), %%mm1 \n\t"

383 
"add %4, %1 \n\t"

384 
"add $16, %2 \n\t"

385 
"movq %%mm0, (%3) \n\t"

386 
"movq %%mm1, 8(%3) \n\t"

387 
"add %5, %3 \n\t"

388 
"decl %0 \n\t"

389 
"1: \n\t"

390 
"movq (%1), %%mm0 \n\t"

391 
"movq 8(%1), %%mm1 \n\t"

392 
"add %4, %1 \n\t"

393 
PAVGB" (%2), %%mm0 \n\t"

394 
PAVGB" 8(%2), %%mm1 \n\t"

395 
"movq %%mm0, (%3) \n\t"

396 
"movq %%mm1, 8(%3) \n\t"

397 
"add %5, %3 \n\t"

398 
"movq (%1), %%mm0 \n\t"

399 
"movq 8(%1), %%mm1 \n\t"

400 
"add %4, %1 \n\t"

401 
PAVGB" 16(%2), %%mm0 \n\t"

402 
PAVGB" 24(%2), %%mm1 \n\t"

403 
"movq %%mm0, (%3) \n\t"

404 
"movq %%mm1, 8(%3) \n\t"

405 
"add %5, %3 \n\t"

406 
"add $32, %2 \n\t"

407 
"subl $2, %0 \n\t"

408 
"jnz 1b \n\t"

409 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
410 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
411 
#else

412 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
413 
#endif

414 
:"S"((long)src1Stride), "D"((long)dstStride) 
415 
:"memory");

416 
//the following should be used, though better not with gcc ...

417 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

418 
:"r"(src1Stride), "r"(dstStride)

419 
:"memory");*/

420 
} 
421  
422 
static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
423 
{ 
424 
__asm __volatile( 
425 
"testl $1, %0 \n\t"

426 
" jz 1f \n\t"

427 
"movq (%1), %%mm0 \n\t"

428 
"movq 8(%1), %%mm1 \n\t"

429 
PAVGB" (%2), %%mm0 \n\t"

430 
PAVGB" 8(%2), %%mm1 \n\t"

431 
"add %4, %1 \n\t"

432 
"add $16, %2 \n\t"

433 
PAVGB" (%3), %%mm0 \n\t"

434 
PAVGB" 8(%3), %%mm1 \n\t"

435 
"movq %%mm0, (%3) \n\t"

436 
"movq %%mm1, 8(%3) \n\t"

437 
"add %5, %3 \n\t"

438 
"decl %0 \n\t"

439 
"1: \n\t"

440 
"movq (%1), %%mm0 \n\t"

441 
"movq 8(%1), %%mm1 \n\t"

442 
"add %4, %1 \n\t"

443 
PAVGB" (%2), %%mm0 \n\t"

444 
PAVGB" 8(%2), %%mm1 \n\t"

445 
PAVGB" (%3), %%mm0 \n\t"

446 
PAVGB" 8(%3), %%mm1 \n\t"

447 
"movq %%mm0, (%3) \n\t"

448 
"movq %%mm1, 8(%3) \n\t"

449 
"add %5, %3 \n\t"

450 
"movq (%1), %%mm0 \n\t"

451 
"movq 8(%1), %%mm1 \n\t"

452 
"add %4, %1 \n\t"

453 
PAVGB" 16(%2), %%mm0 \n\t"

454 
PAVGB" 24(%2), %%mm1 \n\t"

455 
PAVGB" (%3), %%mm0 \n\t"

456 
PAVGB" 8(%3), %%mm1 \n\t"

457 
"movq %%mm0, (%3) \n\t"

458 
"movq %%mm1, 8(%3) \n\t"

459 
"add %5, %3 \n\t"

460 
"add $32, %2 \n\t"

461 
"subl $2, %0 \n\t"

462 
"jnz 1b \n\t"

463 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
464 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
465 
#else

466 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
467 
#endif

468 
:"S"((long)src1Stride), "D"((long)dstStride) 
469 
:"memory");

470 
//the following should be used, though better not with gcc ...

471 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

472 
:"r"(src1Stride), "r"(dstStride)

473 
:"memory");*/

474 
} 
475  
476 
static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 
477 
{ 
478 
__asm __volatile( 
479 
"pcmpeqb %%mm6, %%mm6 \n\t"

480 
"testl $1, %0 \n\t"

481 
" jz 1f \n\t"

482 
"movq (%1), %%mm0 \n\t"

483 
"movq 8(%1), %%mm1 \n\t"

484 
"movq (%2), %%mm2 \n\t"

485 
"movq 8(%2), %%mm3 \n\t"

486 
"pxor %%mm6, %%mm0 \n\t"

487 
"pxor %%mm6, %%mm1 \n\t"

488 
"pxor %%mm6, %%mm2 \n\t"

489 
"pxor %%mm6, %%mm3 \n\t"

490 
PAVGB" %%mm2, %%mm0 \n\t"

491 
PAVGB" %%mm3, %%mm1 \n\t"

492 
"pxor %%mm6, %%mm0 \n\t"

493 
"pxor %%mm6, %%mm1 \n\t"

494 
"add %4, %1 \n\t"

495 
"add $16, %2 \n\t"

496 
"movq %%mm0, (%3) \n\t"

497 
"movq %%mm1, 8(%3) \n\t"

498 
"add %5, %3 \n\t"

499 
"decl %0 \n\t"

500 
"1: \n\t"

501 
"movq (%1), %%mm0 \n\t"

502 
"movq 8(%1), %%mm1 \n\t"

503 
"add %4, %1 \n\t"

504 
"movq (%2), %%mm2 \n\t"

505 
"movq 8(%2), %%mm3 \n\t"

506 
"pxor %%mm6, %%mm0 \n\t"

507 
"pxor %%mm6, %%mm1 \n\t"

508 
"pxor %%mm6, %%mm2 \n\t"

509 
"pxor %%mm6, %%mm3 \n\t"

510 
PAVGB" %%mm2, %%mm0 \n\t"

511 
PAVGB" %%mm3, %%mm1 \n\t"

512 
"pxor %%mm6, %%mm0 \n\t"

513 
"pxor %%mm6, %%mm1 \n\t"

514 
"movq %%mm0, (%3) \n\t"

515 
"movq %%mm1, 8(%3) \n\t"

516 
"add %5, %3 \n\t"

517 
"movq (%1), %%mm0 \n\t"

518 
"movq 8(%1), %%mm1 \n\t"

519 
"add %4, %1 \n\t"

520 
"movq 16(%2), %%mm2 \n\t"

521 
"movq 24(%2), %%mm3 \n\t"

522 
"pxor %%mm6, %%mm0 \n\t"

523 
"pxor %%mm6, %%mm1 \n\t"

524 
"pxor %%mm6, %%mm2 \n\t"

525 
"pxor %%mm6, %%mm3 \n\t"

526 
PAVGB" %%mm2, %%mm0 \n\t"

527 
PAVGB" %%mm3, %%mm1 \n\t"

528 
"pxor %%mm6, %%mm0 \n\t"

529 
"pxor %%mm6, %%mm1 \n\t"

530 
"movq %%mm0, (%3) \n\t"

531 
"movq %%mm1, 8(%3) \n\t"

532 
"add %5, %3 \n\t"

533 
"add $32, %2 \n\t"

534 
"subl $2, %0 \n\t"

535 
"jnz 1b \n\t"

536 
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used 
537 
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
538 
#else

539 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 
540 
#endif

541 
:"S"((long)src1Stride), "D"((long)dstStride) 
542 
:"memory");

543 
//the following should be used, though better not with gcc ...

544 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)

545 
:"r"(src1Stride), "r"(dstStride)

546 
:"memory");*/

547 
} 
548  
549 
/* GL: this function does incorrect rounding if overflow */

550 
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
551 
{ 
552 
MOVQ_BONE(mm6); 
553 
__asm __volatile( 
554 
"lea (%3, %3), %%"REG_a" \n\t" 
555 
"1: \n\t"

556 
"movq (%1), %%mm0 \n\t"

557 
"movq (%1, %3), %%mm2 \n\t"

558 
"movq 1(%1), %%mm1 \n\t"

559 
"movq 1(%1, %3), %%mm3 \n\t"

560 
"add %%"REG_a", %1 \n\t" 
561 
"psubusb %%mm6, %%mm0 \n\t"

562 
"psubusb %%mm6, %%mm2 \n\t"

563 
PAVGB" %%mm1, %%mm0 \n\t"

564 
PAVGB" %%mm3, %%mm2 \n\t"

565 
"movq %%mm0, (%2) \n\t"

566 
"movq %%mm2, (%2, %3) \n\t"

567 
"movq (%1), %%mm0 \n\t"

568 
"movq 1(%1), %%mm1 \n\t"

569 
"movq (%1, %3), %%mm2 \n\t"

570 
"movq 1(%1, %3), %%mm3 \n\t"

571 
"add %%"REG_a", %2 \n\t" 
572 
"add %%"REG_a", %1 \n\t" 
573 
"psubusb %%mm6, %%mm0 \n\t"

574 
"psubusb %%mm6, %%mm2 \n\t"

575 
PAVGB" %%mm1, %%mm0 \n\t"

576 
PAVGB" %%mm3, %%mm2 \n\t"

577 
"movq %%mm0, (%2) \n\t"

578 
"movq %%mm2, (%2, %3) \n\t"

579 
"add %%"REG_a", %2 \n\t" 
580 
"subl $4, %0 \n\t"

581 
"jnz 1b \n\t"

582 
:"+g"(h), "+S"(pixels), "+D"(block) 
583 
:"r" ((long)line_size) 
584 
:"%"REG_a, "memory"); 
585 
} 
586  
587 
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
588 
{ 
589 
__asm __volatile( 
590 
"lea (%3, %3), %%"REG_a" \n\t" 
591 
"movq (%1), %%mm0 \n\t"

592 
"sub %3, %2 \n\t"

593 
"1: \n\t"

594 
"movq (%1, %3), %%mm1 \n\t"

595 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
596 
"add %%"REG_a", %1 \n\t" 
597 
PAVGB" %%mm1, %%mm0 \n\t"

598 
PAVGB" %%mm2, %%mm1 \n\t"

599 
"movq %%mm0, (%2, %3) \n\t"

600 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
601 
"movq (%1, %3), %%mm1 \n\t"

602 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
603 
"add %%"REG_a", %2 \n\t" 
604 
"add %%"REG_a", %1 \n\t" 
605 
PAVGB" %%mm1, %%mm2 \n\t"

606 
PAVGB" %%mm0, %%mm1 \n\t"

607 
"movq %%mm2, (%2, %3) \n\t"

608 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
609 
"add %%"REG_a", %2 \n\t" 
610 
"subl $4, %0 \n\t"

611 
"jnz 1b \n\t"

612 
:"+g"(h), "+S"(pixels), "+D" (block) 
613 
:"r" ((long)line_size) 
614 
:"%"REG_a, "memory"); 
615 
} 
616  
617 
/* GL: this function does incorrect rounding if overflow */

618 
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
619 
{ 
620 
MOVQ_BONE(mm6); 
621 
__asm __volatile( 
622 
"lea (%3, %3), %%"REG_a" \n\t" 
623 
"movq (%1), %%mm0 \n\t"

624 
"sub %3, %2 \n\t"

625 
"1: \n\t"

626 
"movq (%1, %3), %%mm1 \n\t"

627 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
628 
"add %%"REG_a", %1 \n\t" 
629 
"psubusb %%mm6, %%mm1 \n\t"

630 
PAVGB" %%mm1, %%mm0 \n\t"

631 
PAVGB" %%mm2, %%mm1 \n\t"

632 
"movq %%mm0, (%2, %3) \n\t"

633 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
634 
"movq (%1, %3), %%mm1 \n\t"

635 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
636 
"add %%"REG_a", %2 \n\t" 
637 
"add %%"REG_a", %1 \n\t" 
638 
"psubusb %%mm6, %%mm1 \n\t"

639 
PAVGB" %%mm1, %%mm2 \n\t"

640 
PAVGB" %%mm0, %%mm1 \n\t"

641 
"movq %%mm2, (%2, %3) \n\t"

642 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
643 
"add %%"REG_a", %2 \n\t" 
644 
"subl $4, %0 \n\t"

645 
"jnz 1b \n\t"

646 
:"+g"(h), "+S"(pixels), "+D" (block) 
647 
:"r" ((long)line_size) 
648 
:"%"REG_a, "memory"); 
649 
} 
650  
651 
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
652 
{ 
653 
__asm __volatile( 
654 
"lea (%3, %3), %%"REG_a" \n\t" 
655 
"1: \n\t"

656 
"movq (%2), %%mm0 \n\t"

657 
"movq (%2, %3), %%mm1 \n\t"

658 
PAVGB" (%1), %%mm0 \n\t"

659 
PAVGB" (%1, %3), %%mm1 \n\t"

660 
"movq %%mm0, (%2) \n\t"

661 
"movq %%mm1, (%2, %3) \n\t"

662 
"add %%"REG_a", %1 \n\t" 
663 
"add %%"REG_a", %2 \n\t" 
664 
"movq (%2), %%mm0 \n\t"

665 
"movq (%2, %3), %%mm1 \n\t"

666 
PAVGB" (%1), %%mm0 \n\t"

667 
PAVGB" (%1, %3), %%mm1 \n\t"

668 
"add %%"REG_a", %1 \n\t" 
669 
"movq %%mm0, (%2) \n\t"

670 
"movq %%mm1, (%2, %3) \n\t"

671 
"add %%"REG_a", %2 \n\t" 
672 
"subl $4, %0 \n\t"

673 
"jnz 1b \n\t"

674 
:"+g"(h), "+S"(pixels), "+D"(block) 
675 
:"r" ((long)line_size) 
676 
:"%"REG_a, "memory"); 
677 
} 
678  
679 
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
680 
{ 
681 
__asm __volatile( 
682 
"lea (%3, %3), %%"REG_a" \n\t" 
683 
"1: \n\t"

684 
"movq (%1), %%mm0 \n\t"

685 
"movq (%1, %3), %%mm2 \n\t"

686 
PAVGB" 1(%1), %%mm0 \n\t"

687 
PAVGB" 1(%1, %3), %%mm2 \n\t"

688 
PAVGB" (%2), %%mm0 \n\t"

689 
PAVGB" (%2, %3), %%mm2 \n\t"

690 
"add %%"REG_a", %1 \n\t" 
691 
"movq %%mm0, (%2) \n\t"

692 
"movq %%mm2, (%2, %3) \n\t"

693 
"movq (%1), %%mm0 \n\t"

694 
"movq (%1, %3), %%mm2 \n\t"

695 
PAVGB" 1(%1), %%mm0 \n\t"

696 
PAVGB" 1(%1, %3), %%mm2 \n\t"

697 
"add %%"REG_a", %2 \n\t" 
698 
"add %%"REG_a", %1 \n\t" 
699 
PAVGB" (%2), %%mm0 \n\t"

700 
PAVGB" (%2, %3), %%mm2 \n\t"

701 
"movq %%mm0, (%2) \n\t"

702 
"movq %%mm2, (%2, %3) \n\t"

703 
"add %%"REG_a", %2 \n\t" 
704 
"subl $4, %0 \n\t"

705 
"jnz 1b \n\t"

706 
:"+g"(h), "+S"(pixels), "+D"(block) 
707 
:"r" ((long)line_size) 
708 
:"%"REG_a, "memory"); 
709 
} 
710  
711 
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
712 
{ 
713 
__asm __volatile( 
714 
"lea (%3, %3), %%"REG_a" \n\t" 
715 
"movq (%1), %%mm0 \n\t"

716 
"sub %3, %2 \n\t"

717 
"1: \n\t"

718 
"movq (%1, %3), %%mm1 \n\t"

719 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
720 
"add %%"REG_a", %1 \n\t" 
721 
PAVGB" %%mm1, %%mm0 \n\t"

722 
PAVGB" %%mm2, %%mm1 \n\t"

723 
"movq (%2, %3), %%mm3 \n\t"

724 
"movq (%2, %%"REG_a"), %%mm4 \n\t" 
725 
PAVGB" %%mm3, %%mm0 \n\t"

726 
PAVGB" %%mm4, %%mm1 \n\t"

727 
"movq %%mm0, (%2, %3) \n\t"

728 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
729 
"movq (%1, %3), %%mm1 \n\t"

730 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
731 
PAVGB" %%mm1, %%mm2 \n\t"

732 
PAVGB" %%mm0, %%mm1 \n\t"

733 
"add %%"REG_a", %2 \n\t" 
734 
"add %%"REG_a", %1 \n\t" 
735 
"movq (%2, %3), %%mm3 \n\t"

736 
"movq (%2, %%"REG_a"), %%mm4 \n\t" 
737 
PAVGB" %%mm3, %%mm2 \n\t"

738 
PAVGB" %%mm4, %%mm1 \n\t"

739 
"movq %%mm2, (%2, %3) \n\t"

740 
"movq %%mm1, (%2, %%"REG_a") \n\t" 
741 
"add %%"REG_a", %2 \n\t" 
742 
"subl $4, %0 \n\t"

743 
"jnz 1b \n\t"

744 
:"+g"(h), "+S"(pixels), "+D"(block) 
745 
:"r" ((long)line_size) 
746 
:"%"REG_a, "memory"); 
747 
} 
748  
749 
// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter

750 
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
751 
{ 
752 
MOVQ_BONE(mm6); 
753 
__asm __volatile( 
754 
"lea (%3, %3), %%"REG_a" \n\t" 
755 
"movq (%1), %%mm0 \n\t"

756 
PAVGB" 1(%1), %%mm0 \n\t"

757 
".balign 8 \n\t"

758 
"1: \n\t"

759 
"movq (%1, %%"REG_a"), %%mm2 \n\t" 
760 
"movq (%1, %3), %%mm1 \n\t"

761 
"psubusb %%mm6, %%mm2 \n\t"

762 
PAVGB" 1(%1, %3), %%mm1 \n\t"

763 
PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" 
764 
"add %%"REG_a", %1 \n\t" 
765 
PAVGB" %%mm1, %%mm0 \n\t"

766 
PAVGB" %%mm2, %%mm1 \n\t"

767 
PAVGB" (%2), %%mm0 \n\t"

768 
PAVGB" (%2, %3), %%mm1 \n\t"

769 
"movq %%mm0, (%2) \n\t"

770 
"movq %%mm1, (%2, %3) \n\t"

771 
"movq (%1, %3), %%mm1 \n\t"

772 
"movq (%1, %%"REG_a"), %%mm0 \n\t" 
773 
PAVGB" 1(%1, %3), %%mm1 \n\t"

774 
PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" 
775 
"add %%"REG_a", %2 \n\t" 
776 
"add %%"REG_a", %1 \n\t" 
777 
PAVGB" %%mm1, %%mm2 \n\t"

778 
PAVGB" %%mm0, %%mm1 \n\t"

779 
PAVGB" (%2), %%mm2 \n\t"

780 
PAVGB" (%2, %3), %%mm1 \n\t"

781 
"movq %%mm2, (%2) \n\t"

782 
"movq %%mm1, (%2, %3) \n\t"

783 
"add %%"REG_a", %2 \n\t" 
784 
"subl $4, %0 \n\t"

785 
"jnz 1b \n\t"

786 
:"+g"(h), "+S"(pixels), "+D"(block) 
787 
:"r" ((long)line_size) 
788 
:"%"REG_a, "memory"); 
789 
} 
790  
791 
//FIXME the following could be optimized too ...

792 
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
793 
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); 
794 
DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); 
795 
} 
796 
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
797 
DEF(put_pixels8_y2)(block , pixels , line_size, h); 
798 
DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); 
799 
} 
800 
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
801 
DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); 
802 
DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); 
803 
} 
804 
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
805 
DEF(avg_pixels8)(block , pixels , line_size, h); 
806 
DEF(avg_pixels8)(block+8, pixels+8, line_size, h); 
807 
} 
808 
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
809 
DEF(avg_pixels8_x2)(block , pixels , line_size, h); 
810 
DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); 
811 
} 
812 
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
813 
DEF(avg_pixels8_y2)(block , pixels , line_size, h); 
814 
DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); 
815 
} 
816 
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ 
817 
DEF(avg_pixels8_xy2)(block , pixels , line_size, h); 
818 
DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); 
819 
} 
820 