ffmpeg / libavcodec / alpha / motion_est_alpha.c @ 2e63619f
History  View  Annotate  Download (9.6 KB)
1 
/*


2 
* Alpha optimized DSP utils

3 
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
#include "libavcodec/dsputil.h" 
23 
#include "dsputil_alpha.h" 
24 
#include "asm.h" 
25  
26 
/**
 * Copy an 8-row block of bytes into block, widening each byte.
 *
 * Every iteration loads one 64-bit quadword from pixels, passes its low
 * and high 32-bit halves through unpkbw() and stores the two results at
 * block and block + 4.  pixels then advances by line_size bytes and block
 * by 8 elements.  The loop runs exactly 8 times.
 *
 * NOTE(review): ldq(), stq() and unpkbw() come from asm.h, which is not
 * visible here; ldq()/stq() are presumably aligned 64-bit accesses, so
 * pixels and block are assumed to be 8-byte aligned -- confirm.
 *
 * @param block     destination rows, 8 elements per row
 * @param pixels    first source byte of the top row
 * @param line_size distance in bytes between consecutive source rows
 */
void get_pixels_mvi(DCTELEM *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int h = 8;

    do {
        uint64_t p;

        p = ldq(pixels);
        stq(unpkbw(p),       block);
        stq(unpkbw(p >> 32), block + 4);

        pixels += line_size;
        block  += 8;
    } while (--h);              /* pre-decrement: exactly 8 rows */
}
42  
43 
/**
 * Store the per-byte difference s1 - s2 of an 8-row block into block.
 *
 * The subtraction is done on whole 64-bit quadwords, so a byte lane where
 * s1 < s2 borrows from the lane above it.  The code compensates for that
 * and builds a sign byte for every lane before widening.
 *
 * NOTE(review): the lane-level description below assumes the usual Alpha
 * semantics of the asm.h helpers (cmpbge() yields one mask bit per byte
 * with x >= y, zap() clears the bytes selected by the mask, unpkbw()
 * widens bytes to 16-bit words) -- asm.h is not visible here, confirm.
 *
 * @param block  destination rows, 8 elements per row
 * @param s1     minuend rows
 * @param s2     subtrahend rows
 * @param stride distance in bytes between consecutive rows of s1 and s2
 */
void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
                     int stride) {
    int h = 8;
    uint64_t mask = 0x4040;

    /* Replicate 0x40 into all eight bytes: 0x4040404040404040. */
    mask |= mask << 16;
    mask |= mask << 32;
    do {
        uint64_t x, y, c, d, a;
        uint64_t signs;

        x = ldq(s1);
        y = ldq(s2);
        c = cmpbge(x, y);
        d = x - y;
        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
        d += 4 * a;             /* ...so we can use s4addq here.      */
        signs = zap(-1, c);     /* all-ones in the lanes zap() keeps  */

        /* Widen the differences and put the sign byte in the high half. */
        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);

        s1    += stride;
        s2    += stride;
        block += 8;
    } while (--h);              /* pre-decrement: exactly 8 rows */
}
70  
71 
/**
 * Byte-wise average of two packed quadwords.
 *
 * Uses the identity (a | b) - ((a ^ b) >> 1), which gives the average
 * rounded up.  The xor is masked with BYTE_VEC(0xfe) before the shift so
 * that no bit crosses from one byte lane into the lane below it.
 *
 * NOTE(review): BYTE_VEC() comes from asm.h and presumably replicates its
 * argument into all eight bytes -- confirm.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
75  
76 
/**
 * Byte-wise average of four packed quadwords.
 *
 * The upper six bits and the lower two bits of every byte are handled
 * separately so that nothing carries between byte lanes:
 *  - r1 sums the four inputs with their low two bits cleared, each
 *    already shifted right by 2;
 *  - r2 sums the low two bits of the four inputs plus a constant 2 in
 *    every lane, shifts the sum right by 2 and keeps two bits per lane.
 *
 * NOTE(review): BYTE_VEC() comes from asm.h and presumably replicates its
 * argument into all eight bytes -- confirm.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
89  
90 
/**
 * Accumulate perr() over h rows of 8 bytes from pix1 and pix2.
 *
 * pix1 is always read with ldq(); pix2 is read with uldq() when its
 * address is not a multiple of 8 and with ldq() otherwise.
 *
 * NOTE(review): perr(), ldq() and uldq() come from asm.h, which is not
 * visible here; perr() is presumably the Alpha MVI "pixel error"
 * (sum of absolute byte differences) -- confirm.
 *
 * @param v         unused
 * @param pix1      first block; read with ldq()
 * @param pix2      second block, any alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows; must be at least 1 because the loops
 *                  test the counter only after the first row
 * @return          sum of perr() over all rows
 */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixel a time */
            uint64_t p1, p2;

            p1  = ldq(pix1);
            p2  = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
121  
122 
#if 0 /* now done in assembly */
/*
 * Disabled C version: accumulates perr() over 16 rows of 16 bytes.
 * When pix2 is not 8-byte aligned, each row of pix2 is assembled from
 * three ldq_u() loads merged with extql()/extqh().
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixel a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            t     = ldq_u(pix2 + 8);
            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

164  
165 
/**
 * Accumulate perr() between pix1 and the horizontally interpolated pix2
 * over h rows of 16 bytes.
 *
 * Each byte of a pix2 row is averaged (avg2()) with the byte at the next
 * higher address, so 17 bytes of every pix2 row are used.  Three code
 * paths are selected by the low three address bits of pix2 ("disalign"):
 *  - 0: aligned loads; the shifted-by-one-byte copy is built with
 *       shifts, and pix2[16] supplies the last byte;
 *  - 7: the shifted-by-one-byte copy is exactly the next aligned
 *       quadword, so it is used directly;
 *  - otherwise: both copies are extracted from three ldq_u() loads with
 *       extql()/extqh() at offsets disalign and disalign + 1.
 *
 * NOTE(review): ldq(), ldq_u(), extql(), extqh() and perr() come from
 * asm.h, which is not visible here -- semantics assumed to match the
 * Alpha instructions of the same names, confirm.
 *
 * @param v         unused
 * @param pix1      reference block; read with ldq()
 * @param pix2      block to interpolate, any alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows; must be at least 1 because the loops
 *                  test the counter only after the first row
 * @return          sum of perr() over all rows
 */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq(pix2);
            r     = ldq(pix2 + 8);
            /* Second operand: same row moved down by one byte. */
            p2_l  = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r  = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh.  At least it is a bit faster
           that way :)  */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
                         extql(l, disalign1) | extqh(m, disalign1));
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
                         extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
237  
238 
/**
 * Accumulate perr() between pix1 and the vertically interpolated pix2
 * over h rows of 16 bytes.
 *
 * Every row of pix2 is averaged (avg2()) with the row below it before
 * being compared, so h + 1 rows of pix2 are read.  The row loaded in one
 * iteration is carried over as the "previous" row of the next one.
 * The alignment of pix2 is tested once, before the loop.
 *
 * NOTE(review): ldq(), ldq_u(), extql(), extqh() and perr() come from
 * asm.h, which is not visible here -- semantics assumed to match the
 * Alpha instructions of the same names, confirm.
 *
 * @param v         unused
 * @param pix1      reference block; read with ldq()
 * @param pix2      block to interpolate, any alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows; must be at least 1 because the loops
 *                  test the counter only after the first row
 * @return          sum of perr() over all rows
 */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;

        /* Assemble the first (unaligned) row from three ldq_u() loads. */
        t     = ldq_u(pix2 + 8);
        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;

        } while (--h);
    } else {
        uint64_t p2_l, p2_r;

        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}
290  
291 
/**
 * Accumulate perr() between pix1 and the diagonally interpolated pix2
 * over h rows of 16 bytes.
 *
 * Each compared byte is avg4() of a 2x2 neighbourhood of pix2: the byte
 * itself, the byte at the next higher address, and the same two bytes in
 * the row below.  For every row the function keeps the two quadwords of
 * the row plus a 17th byte (p2_x, held in bits 56..63) that is merged
 * into the right-hand quadword once it has been shifted down one byte.
 *
 * h + 1 rows of pix2 are read.  The next row of pix1 is also loaded in
 * every iteration, including the last one, so h + 1 rows of pix1 are
 * read as well even though only h of them contribute to the result.
 *
 * NOTE(review): ldq(), uldq() and perr() come from asm.h, which is not
 * visible here -- semantics assumed to match the Alpha instructions of
 * the same names, confirm.
 *
 * @param v         unused
 * @param pix1      reference block; read with ldq()
 * @param pix2      block to interpolate, any alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows; must be at least 1 because the loop
 *                  tests the counter only after the first row
 * @return          sum of perr() over all rows
 */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;    /* keeps only the lowest byte */
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        /* The row just loaded becomes the current row. */
        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}