Statistics
| Branch: | Revision:

ffmpeg / libavcodec / alpha / motion_est_alpha.c @ b550bfaa

History | View | Annotate | Download (9.57 KB)

1
/*
2
 * Alpha optimized DSP utils
3
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
#include "asm.h"
23
#include "dsputil.h"
24

    
25
/**
 * Widen an 8x8 block of 8-bit pixels into 64 consecutive DCTELEM values.
 *
 * @param block     destination; eight elements are written per row
 * @param pixels    top-left source pixel; every row is fetched with ldq(),
 *                  so it presumably has to be 8-byte aligned (see asm.h)
 * @param line_size distance in bytes between the starts of successive rows
 */
void get_pixels_mvi(DCTELEM *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int row;

    for (row = 0; row < 8; row++) {
        /* A single quadword load picks up all eight pixels of the row. */
        const uint64_t packed = ldq(pixels);

        /* unpkbw widens the four low bytes into four words, so the row is
           written as two quadword stores of four elements each (this
           relies on DCTELEM being 16 bits wide).  */
        stq(unpkbw(packed),       block);
        stq(unpkbw(packed >> 32), block + 4);

        block  += 8;
        pixels += line_size;
    }
}
41

    
42
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 pixel blocks as 64
 * sign-extended words in block.
 *
 * @param block  destination; eight elements are written per row
 * @param s1     minuend rows, fetched with ldq()
 * @param s2     subtrahend rows, fetched with ldq()
 * @param stride distance in bytes between successive rows of both sources
 */
void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
                     int stride)
{
    /* 0x40 in every byte lane.  Multiplied by 4 it becomes 0x100, i.e. a
       one in the next lane up, and "x + 4 * y" can be a single s4addq.  */
    const uint64_t lane_fix = 0x4040404040404040ULL;
    int row;

    for (row = 0; row < 8; row++) {
        const uint64_t lhs = ldq(s1);
        const uint64_t rhs = ldq(s2);
        /* Bit i is set when byte i of lhs is >= byte i of rhs. */
        const uint64_t ge  = cmpbge(lhs, rhs);
        /* Keep 0x40 only in the lanes where lhs < rhs. */
        const uint64_t fix = zap(lane_fix, ge);
        /* 0xff in the lanes where lhs < rhs: the high byte of a negative
           16-bit result.  */
        const uint64_t neg = zap(-1, ge);
        /* Full-width subtraction; every lane with lhs < rhs borrows one
           from the lane above it.  */
        uint64_t diff = lhs - rhs;

        /* Hand the borrowed ones back so each lane holds its own
           difference modulo 256.  */
        diff += 4 * fix;

        stq(unpkbw(diff)       | (unpkbw(neg)       << 8), block);
        stq(unpkbw(diff >> 32) | (unpkbw(neg >> 32) << 8), block + 4);

        block += 8;
        s1    += stride;
        s2    += stride;
    }
}
69

    
70
/*
 * Average of a and b, rounded up, computed independently in each byte lane
 * (assuming BYTE_VEC(x) replicates x into all eight bytes).
 *
 * Uses a + b == 2 * (a | b) - (a ^ b), hence
 * (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1).
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t any_bits  = a | b;
    /* Clearing bit 0 of every lane first keeps the shift from leaking a
       bit into the lane below.  */
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return any_bits - half_diff;
}
74

    
75
/*
 * Rounded average (l1 + l2 + l3 + l4 + 2) >> 2, computed independently in
 * each byte lane (assuming BYTE_VEC(x) replicates x into all eight bytes).
 *
 * The upper six bits and the lower two bits of every lane are summed
 * separately so that no intermediate value can overflow its lane.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2 = BYTE_VEC(0x03);
    uint64_t quarters, rest;

    /* Each term is at most 63 per lane, so the sum is at most 252. */
    quarters  = (l1 & ~low2) >> 2;
    quarters += (l2 & ~low2) >> 2;
    quarters += (l3 & ~low2) >> 2;
    quarters += (l4 & ~low2) >> 2;

    /* Low two bits plus the rounding constant: at most 4 * 3 + 2 = 14 per
       lane; after the shift the mask drops what came from the lane above. */
    rest  = l1 & low2;
    rest += l2 & low2;
    rest += l3 & low2;
    rest += l4 & low2;
    rest += BYTE_VEC(0x02);
    rest  = (rest >> 2) & low2;

    return quarters + rest;
}
88

    
89
/**
 * Accumulate perr() of pix1 against pix2 over h rows of eight pixels.
 *
 * @param v         unused
 * @param pix1      first block; every row is fetched with ldq()
 * @param pix2      second block; may be unaligned
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows; the loops are do/while, so it must be
 *                  at least 1
 * @return the accumulated perr() values
 */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if (((size_t) pix2 & 0x7) == 0) {
        /* Both rows can use the plain load. */
        do {
            const uint64_t cur = ldq(pix1);
            const uint64_t ref = ldq(pix2);

            sad  += perr(cur, ref);
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);

        return sad;
    }

    /* uldq works only when pix2 is actually unaligned, which is why the
       aligned case is handled separately above.  */
    do {                        /* eight pixels at a time */
        const uint64_t cur = ldq(pix1);
        const uint64_t ref = uldq(pix2);

        sad  += perr(cur, ref);
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);

    return sad;
}
120

    
121
#if 0                           /* now done in assembly */
/*
 * Disabled C reference version: accumulates perr() of pix1 against pix2
 * over sixteen rows of sixteen pixels.  Note that it still has the old
 * three-argument signature, unlike the other functions in this file.
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sad = 0;
    int rows_left = 16;

    if (((size_t) pix2 & 0x7) == 0) {
        do {
            uint64_t cur_l, cur_r, ref_l, ref_r;

            cur_l = ldq(pix1);
            cur_r = ldq(pix1 + 8);
            ref_l = ldq(pix2);
            ref_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);
        } while (--rows_left);

        return sad;
    }

    /* works only when pix2 is actually unaligned */
    do {                        /* sixteen pixels at a time */
        uint64_t cur_l, cur_r, ref_l, ref_r;
        uint64_t mid;

        cur_l = ldq(pix1);
        cur_r = ldq(pix1 + 8);
        /* The middle quadword is shared by both halves of the row. */
        mid   = ldq_u(pix2 + 8);
        ref_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        ref_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);
        pix1 += line_size;
        pix2 += line_size;

        sad += perr(cur_l, ref_l)
             + perr(cur_r, ref_r);
    } while (--rows_left);

    return sad;
}
#endif
163

    
164
/**
 * Accumulate perr() of pix1 against the horizontal half-pel interpolation
 * of pix2 (avg2 of each pixel and its right-hand neighbour) over h rows of
 * sixteen pixels.
 *
 * @param v         unused
 * @param pix1      first block; every row is fetched with ldq()
 * @param pix2      second block; may be unaligned.  Its alignment is
 *                  sampled once, before the loop.
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows; the loops are do/while, so it must be
 *                  at least 1
 * @return the accumulated perr() values
 */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint64_t shift = (size_t) pix2 & 0x7;
    int sad = 0;

    if (shift == 0) {
        do {
            uint64_t cur_l, cur_r, ref_l, ref_r;
            uint64_t lo, hi;

            cur_l = ldq(pix1);
            cur_r = ldq(pix1 + 8);
            lo    = ldq(pix2);
            hi    = ldq(pix2 + 8);
            /* The neighbour vector is the row moved down by one byte, with
               the first byte of the following data filling the top lane.  */
            ref_l = avg2(lo, (lo >> 8) | (hi << 56));
            ref_r = avg2(hi, (hi >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);
        } while (--h);
    } else if (shift == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           Handled on its own because shift + 1 would be 8, which extqh
           treats as 0.  The neighbour vectors are then exactly the second
           and third quadword, which also makes this case a bit faster.  */
        do {
            uint64_t cur_l, cur_r, ref_l, ref_r;
            uint64_t q0, q1, q2;

            cur_l = ldq(pix1);
            cur_r = ldq(pix1 + 8);
            q0    = ldq_u(pix2);
            q1    = ldq_u(pix2 + 8);
            q2    = ldq_u(pix2 + 16);
            ref_l = avg2(extql(q0, shift) | extqh(q1, shift), q1);
            ref_r = avg2(extql(q1, shift) | extqh(q2, shift), q2);
            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);
        } while (--h);
    } else {
        /* Byte offset of the right-hand neighbours within the same three
           quadwords.  */
        const uint64_t shift1 = shift + 1;

        do {
            uint64_t cur_l, cur_r, ref_l, ref_r;
            uint64_t q0, q1, q2;

            cur_l = ldq(pix1);
            cur_r = ldq(pix1 + 8);
            q0    = ldq_u(pix2);
            q1    = ldq_u(pix2 + 8);
            q2    = ldq_u(pix2 + 16);
            ref_l = avg2(extql(q0, shift)  | extqh(q1, shift),
                         extql(q0, shift1) | extqh(q1, shift1));
            ref_r = avg2(extql(q1, shift)  | extqh(q2, shift),
                         extql(q1, shift1) | extqh(q2, shift1));
            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_l, ref_l)
                 + perr(cur_r, ref_r);
        } while (--h);
    }

    return sad;
}
236

    
237
/**
 * Accumulate perr() of pix1 against the vertical half-pel interpolation of
 * pix2 (avg2 of each row and the row below it) over h rows of sixteen
 * pixels.  Rows 0..h of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block; every row is fetched with ldq()
 * @param pix2      second block; may be unaligned.  Its alignment is
 *                  sampled once, before the loop.
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows; the loops are do/while, so it must be
 *                  at least 1
 * @return the accumulated perr() values
 */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t mid, upper_l, upper_r;

        /* Row 0 of pix2; the middle quadword serves both halves. */
        mid     = ldq_u(pix2 + 8);
        upper_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        upper_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t cur_l, cur_r, lower_l, lower_r;

            cur_l   = ldq(pix1);
            cur_r   = ldq(pix1 + 8);
            pix2   += line_size;
            mid     = ldq_u(pix2 + 8);
            lower_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            lower_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sad += perr(cur_l, avg2(upper_l, lower_l))
                 + perr(cur_r, avg2(upper_r, lower_r));

            pix1   += line_size;
            /* This row is the upper row of the next iteration, so every
               pix2 row is loaded only once.  */
            upper_l = lower_l;
            upper_r = lower_r;
        } while (--h);
    } else {
        uint64_t upper_l, upper_r;

        upper_l = ldq(pix2);
        upper_r = ldq(pix2 + 8);

        do {
            uint64_t cur_l, cur_r, lower_l, lower_r;

            cur_l   = ldq(pix1);
            cur_r   = ldq(pix1 + 8);
            pix2   += line_size;
            lower_l = ldq(pix2);
            lower_r = ldq(pix2 + 8);

            sad += perr(cur_l, avg2(upper_l, lower_l))
                 + perr(cur_r, avg2(upper_r, lower_r));

            pix1   += line_size;
            upper_l = lower_l;
            upper_r = lower_r;
        } while (--h);
    }

    return sad;
}
289

    
290
/**
 * Accumulate perr() of pix1 against the diagonal half-pel interpolation of
 * pix2 (avg4 of each pixel, its right-hand neighbour and the two pixels
 * below them) over h rows of sixteen pixels.
 *
 * Rows 0..h of pix2 are read, seventeen pixels wide.  Only rows 0..h-1 of
 * pix1 are read: each pix1 row is loaded at the top of the iteration that
 * uses it, so no load is issued for the row following the block.
 *
 * @param v         unused
 * @param pix1      first block; every row is fetched with ldq()
 * @param pix2      second block; may be unaligned.  Its alignment is
 *                  re-tested for every row.
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows; the loop is do/while, so it must be at
 *                  least 1
 * @return the accumulated perr() values
 */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    /* Current pix2 row: left half, right half, and the seventeenth pixel
       already moved into the top byte lane.  */
    uint64_t p2_l, p2_r, p2_x;

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        /* Only the lowest byte of this quadword survives the shift. */
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Fetch the pix1 row for this iteration before moving on. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        /* The right-hand neighbours are the row moved down by one byte,
           with the first byte of the following data in the top lane.  */
        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ( p2_r << 56),
                            np2_l, (np2_l >> 8) | (np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) |  p2_x,
                            np2_r, (np2_r >> 8) | np2_x));

        /* The lower row becomes the upper row of the next iteration. */
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}