
ffmpeg / libavcodec / alpha / motion_est_alpha.c @ f9bb4bdf

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "asm.h"
#include "../dsputil.h"

void get_pixels_mvi(DCTELEM *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int h = 8;

    do {
        uint64_t p;

        p = ldq(pixels);
        stq(unpkbw(p),       block);
        stq(unpkbw(p >> 32), block + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
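
/* unpkbw zero-extends the low four bytes of its operand into four
   16-bit lanes, so one ldq plus two unpkbw/stq pairs widen a row of
   eight pixels into eight DCTELEMs.  A hypothetical scalar reference
   of the same transform, assuming 16-bit DCTELEMs (sketch only, not
   part of the build):  */
#if 0
static void get_pixels_ref(DCTELEM *restrict block,
                           const uint8_t *restrict pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[j] = pixels[j];       /* widen each byte */
        pixels += line_size;
        block  += 8;
    }
}
#endif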

void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
                     int stride) {
    int h = 8;
    uint64_t mask = 0x4040;

    mask |= mask << 16;
    mask |= mask << 32;
    do {
        uint64_t x, y, c, d, a;
        uint64_t signs;

        x = ldq(s1);
        y = ldq(s2);
        c = cmpbge(x, y);
        d = x - y;
        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
        d += 4 * a;             /* ...so we can use s4addq here.      */
        signs = zap(-1, c);

        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);

        s1 += stride;
        s2 += stride;
        block += 8;
    } while (--h);
}
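
/* The cmpbge/zap pair above repairs the single 64-bit subtraction:
   c has bit i set where byte x_i >= y_i, so zap(mask, c) leaves 0x40
   in exactly the lanes that borrowed, and d += 4 * a (one s4addq)
   adds 0x100 at each such lane, repaying the borrow taken from the
   lane above.  Every byte of d then equals (x_i - y_i) mod 256, and
   the 0xff bytes of `signs`, merged into the high byte of each 16-bit
   lane, sign-extend the result.  A hypothetical scalar reference
   (sketch only, not part of the build):  */
#if 0
static void diff_pixels_ref(DCTELEM *block, const uint8_t *s1,
                            const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[j] = s1[j] - s2[j];   /* signed 16-bit difference */
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
#endif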

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
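
/* avg2() is the carry-free rounding average: (a | b) - ((a ^ b) >> 1)
   equals (a + b + 1) >> 1, and masking with BYTE_VEC(0xfe) before the
   shift keeps each byte lane's low bit from leaking into the lane
   below.  Per-lane scalar equivalent (sketch only, not built):  */
#if 0
static uint8_t avg2_ref(uint8_t a, uint8_t b)
{
    return (a + b + 1) >> 1;            /* rounded-up average */
}
#endif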

static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
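
/* avg4() is the MPEG rounding average (l1 + l2 + l3 + l4 + 2) >> 2 per
   byte lane: r1 sums the top six bits of each lane (four addends of at
   most 0x3f each, so no carry escapes a lane) and r2 rounds the four
   two-bit remainders.  Per-lane scalar equivalent (sketch only, not
   built):  */
#if 0
static uint8_t avg4_ref(uint8_t l1, uint8_t l2, uint8_t l3, uint8_t l4)
{
    return (l1 + l2 + l3 + l4 + 2) >> 2;
}
#endif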

int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 8;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1  = ldq(pix1);
            p2  = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
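
/* perr is the MVI sum-of-absolute-differences instruction: it adds up
   |p1_i - p2_i| over all eight byte lanes, so each loop iteration above
   scores one full row.  A hypothetical scalar reference for the whole
   8x8 SAD (sketch only, not built):  */
#if 0
static int pix_abs8x8_ref(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0, i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            result += pix1[j] > pix2[j] ? pix1[j] - pix2[j]
                                        : pix2[j] - pix1[j];
        pix1 += line_size;
        pix2 += line_size;
    }
    return result;
}
#endif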

#if 0                                /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            t     = ldq_u(pix2 + 8);
            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif
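
/* The extql/extqh pairs above are the usual Alpha unaligned-load idiom:
   extql(ldq_u(p), p) | extqh(ldq_u(p + 8), p) reassembles the unaligned
   quadword at p from the two aligned quadwords straddling it.  Sharing
   the middle quadword t between both halves makes an unaligned 16-byte
   row cost three loads instead of four.  The idiom only holds for a
   genuinely unaligned p (an alignment of 0 makes extqh pass its operand
   through unshifted), hence the guard comments above.  */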

int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh.  At least it is a bit faster
           that way :)  */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
                         extql(l, disalign1) | extqh(m, disalign1));
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
                         extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
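
/* All three cases above compute the same SAD against the horizontally
   half-pel interpolated reference, i.e. each reference byte averaged
   (rounding up) with its right-hand neighbour; the switch only decides
   how to splice that byte stream out of aligned quadwords.  Case 7 is
   split out because disalign + 1 would be 8, which extqh treats as 0.
   A hypothetical scalar reference (sketch only, not built):  */
#if 0
static int pix_abs16x16_x2_ref(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
            int avg = (pix2[j] + pix2[j + 1] + 1) >> 1;
            int d   = pix1[j] - avg;
            result += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return result;
}
#endif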

int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t     = ldq_u(pix2 + 8);
        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}
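
/* The vertical case needs each reference row twice (averaged with the
   row below it), so both branches keep the previous row in p2_l/p2_r
   and load just one new row per iteration instead of two.  */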

int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
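
/* pix_abs16x16_xy2_mvi() scores against the two-dimensional half-pel
   reference: the avg4 of each pixel with its right, lower and
   lower-right neighbours.  The >> 8 / << 56 splices build the
   one-byte-shifted copies, with p2_x carrying pixel 16 of the current
   row.  A hypothetical scalar reference (sketch only, not built):  */
#if 0
static int pix_abs16x16_xy2_ref(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
            int avg = (pix2[j]             + pix2[j + 1]
                     + pix2[j + line_size] + pix2[j + line_size + 1] + 2) >> 2;
            int d   = pix1[j] - avg;
            result += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return result;
}
#endif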