Revision 67fd620c libavcodec/bfin/dsputil_bfin.c

View differences:

libavcodec/bfin/dsputil_bfin.c
1 1
/*
2
 * Copyright (c) 2006 Michael Benjamin
2
 * BlackFin DSPUTILS
3
 *
4
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5
 * Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
3 6
 *
4 7
 * This file is part of FFmpeg.
5 8
 *
......
18 21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 22
 */
20 23

  
24
#include <unistd.h>
25
#include <bits/bfin_sram.h>
21 26
#include "../avcodec.h"
22 27
#include "../dsputil.h"
23 28

  
24
static int sad8x8_bfin( void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h )
25
{
26
    int sum;
27
    __asm__ __volatile__ (
28
    "P0 = %1;" // blk1
29
    "P1 = %2;" // blk2
30
    "P2 = %3;\n" // h
31
    "I0 = P0;"
32
    "I1 = P1;\n"
33
    "A0 = 0;"
34
    "A1 = 0;\n"
35
    "M0 = P2;\n"
36
    "P3 = 32;\n"
37
    "LSETUP (sad8x8LoopBegin, sad8x8LoopEnd) LC0=P3;\n"
38
    "sad8x8LoopBegin:\n"
39
    "  DISALGNEXCPT || R0 = [I0] || R2 = [I1];\n"
40
    "  DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];\n"
41
    "sad8x8LoopEnd:\n"
42
    "  SAA ( R1:0 , R3:2 );\n"
43
    "R3 = A1.L + A1.H, R2 = A0.L + A0.H;\n"
44
    "%0 = R2 + R3 (S);\n"
45
    : "=&d" (sum)
46
    : "m"(blk1), "m"(blk2), "m"(h)
47
    : "P0","P1","P2","I0","I1","A0","A1","R0","R1","R2","R3");
48
    return sum;
29
#define USE_L1CODE
30

  
31
#ifdef USE_L1CODE
32
#define L1CODE __attribute__ ((l1_text))
33
#else
34
#define L1CODE
35
#endif
36
int off;
37

  
38

  
39
extern void ff_bfin_idct (DCTELEM *block) L1CODE;
40
extern void ff_bfin_fdct (DCTELEM *block) L1CODE;
41
extern void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
42
extern void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
43
extern void ff_bfin_diff_pixels (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)  L1CODE;
44
extern void ff_bfin_get_pixels  (DCTELEM *restrict block, const uint8_t *pixels, int line_size) L1CODE;
45
extern int  ff_bfin_pix_norm1  (uint8_t * pix, int line_size) L1CODE;
46
extern int  ff_bfin_z_sad8x8   (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
47
extern int  ff_bfin_z_sad16x16 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
48

  
49
extern void ff_bfin_z_put_pixels16_xy2     (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
50
extern void ff_bfin_z_put_pixels8_xy2      (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
51
extern void ff_bfin_put_pixels16_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
52
extern void ff_bfin_put_pixels8_xy2_nornd  (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
53

  
54

  
55
extern int  ff_bfin_pix_sum (uint8_t *p, int stride) L1CODE;
56

  
57
extern void ff_bfin_put_pixels8uc        (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
58
extern void ff_bfin_put_pixels16uc       (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
59
extern void ff_bfin_put_pixels8uc_nornd  (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
60
extern void ff_bfin_put_pixels16uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
61

  
62
extern int ff_bfin_sse4  (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
63
extern int ff_bfin_sse8  (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
64
extern int ff_bfin_sse16 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
65

  
66

  
67
#if 0
68
/**
 * Debug helper: log the address of p, then h rows of w bytes each as
 * decimal values, advancing p by s bytes after every row.
 *
 * @param p first byte of the block to dump
 * @param w number of bytes printed per row
 * @param h number of rows
 * @param s distance in bytes between the starts of consecutive rows
 */
void pblk (uint8_t *p, int w, int h, int s)
{
    int i,j;
    /* A pointer must be printed with %p; handing it to %x is a
       format/argument mismatch. */
    av_log (0,0,"%p:\n", (void *) p);
    for (i = 0;i<h;i++) {
        for (j=0;j<w;j++)
            av_log (0,0,"%3d ", p[j]);
        p+=s;
        av_log (0,0,"\n");
    }
    av_log (0,0,"\n");
}
80
#endif
81

  
82
/**
 * Run ff_bfin_idct() on block in place, then hand the result to
 * ff_bfin_add_pixels_clamped() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size stride passed through to ff_bfin_add_pixels_clamped()
 * @param block     coefficient block; modified by the IDCT call
 */
static void bfin_idct_add (uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Order matters: the add step consumes the transformed block. */
    ff_bfin_idct (block);
    ff_bfin_add_pixels_clamped (block,
                                dest,
                                line_size);
}
87

  
88
/**
 * Run ff_bfin_idct() on block in place, then hand the result to
 * ff_bfin_put_pixels_clamped() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size stride passed through to ff_bfin_put_pixels_clamped()
 * @param block     coefficient block; modified by the IDCT call
 */
static void bfin_idct_put (uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Order matters: the put step consumes the transformed block. */
    ff_bfin_idct (block);
    ff_bfin_put_pixels_clamped (block,
                                dest,
                                line_size);
}
93

  
94

  
95
/**
 * Zero the coefficient storage that starts at blocks.
 *
 * A one-instruction hardware loop runs 192 times, each pass storing R0 (= 0)
 * through I0 with post-increment. With 32-bit stores that is 768 bytes,
 * presumably six 64-entry blocks of 16-bit DCTELEM -- TODO confirm against
 * the DCTELEM definition.
 *
 * @param blocks start of the area to clear
 */
static void bfin_clear_blocks (DCTELEM *blocks)
{
    /* The stores go through a pointer the compiler only sees as an input
       operand, so a "memory" clobber is required to stop it from caching or
       reordering accesses to *blocks around this statement.
       NOTE(review): the hardware loop registers (LC0/LT0/LB0) are modified
       too but are not listed as clobbers, as in the original; verify whether
       the toolchain needs them declared. */
    __asm__ __volatile__ (
        "P0=192; "
        "I0=%0;  "
        "R0=0;   "
        "LSETUP(clear_blocks_blkfn_lab,clear_blocks_blkfn_lab)LC0=P0;"
        "clear_blocks_blkfn_lab:"
        "[I0++]=R0;"
        :
        : "a" (blocks)
        : "P0","I0","R0","memory");
}
107

  
108

  
109

  
110
/**
 * Full-pel 8-wide put: calls ff_bfin_put_pixels8uc() with the same pointer
 * for both source operands and line_size for both stride arguments.
 */
static void bfin_put_pixels8 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;   /* identical operand: no offset */

    ff_bfin_put_pixels8uc (block, first, second, line_size, line_size, h);
}
114

  
115
/**
 * Horizontal half-pel 8-wide put: the second source operand handed to
 * ff_bfin_put_pixels8uc() is the first one shifted right by one byte.
 */
static void bfin_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels8uc (block, pixels, right, line_size, line_size, h);
}
119

  
120
/**
 * Vertical half-pel 8-wide put: the second source operand handed to
 * ff_bfin_put_pixels8uc() is the first one advanced by line_size bytes.
 */
static void bfin_put_pixels8_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels8uc (block, pixels, below, line_size, line_size, h);
}
124

  
125
/**
 * Diagonal half-pel 8-wide put: forwards to ff_bfin_z_put_pixels8_xy2(),
 * using line_size as both the destination and the source stride.
 */
static void bfin_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
{
    const int dest_size = line_size;   /* destination shares the source stride */

    ff_bfin_z_put_pixels8_xy2 (block, s0, dest_size, line_size, h);
}
129

  
130
/**
 * Full-pel 16-wide put: calls ff_bfin_put_pixels16uc() with the same pointer
 * for both source operands and line_size for both stride arguments.
 */
static void bfin_put_pixels16 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;   /* identical operand: no offset */

    ff_bfin_put_pixels16uc (block, first, second, line_size, line_size, h);
}
134

  
135
/**
 * Horizontal half-pel 16-wide put: the second source operand handed to
 * ff_bfin_put_pixels16uc() is the first one shifted right by one byte.
 */
static void bfin_put_pixels16_x2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels16uc (block, pixels, right, line_size, line_size, h);
}
139

  
140
/**
 * Vertical half-pel 16-wide put: the second source operand handed to
 * ff_bfin_put_pixels16uc() is the first one advanced by line_size bytes.
 */
static void bfin_put_pixels16_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels16uc (block, pixels, below, line_size, line_size, h);
}
144

  
145
/**
 * Diagonal half-pel 16-wide put: forwards to ff_bfin_z_put_pixels16_xy2(),
 * using line_size as both the destination and the source stride.
 */
static void bfin_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
{
    const int dest_size = line_size;   /* destination shares the source stride */

    ff_bfin_z_put_pixels16_xy2 (block, s0, dest_size, line_size, h);
}
149

  
150
/**
 * Full-pel 8-wide put, no-rounding variant: calls
 * ff_bfin_put_pixels8uc_nornd() with the same pointer for both sources.
 *
 * NOTE(review): this wrapper has external linkage while the other put
 * wrappers in this file are static -- confirm whether that is intended.
 */
void bfin_put_pixels8_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;   /* identical operand: no offset */

    ff_bfin_put_pixels8uc_nornd (block, first, second, line_size, h);
}
154

  
155
/**
 * Horizontal half-pel 8-wide put, no-rounding variant: the second source
 * given to ff_bfin_put_pixels8uc_nornd() is pixels shifted by one byte.
 */
static void bfin_put_pixels8_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels8uc_nornd (block, pixels, right, line_size, h);
}
159

  
160
/**
 * Vertical half-pel 8-wide put, no-rounding variant: the second source
 * given to ff_bfin_put_pixels8uc_nornd() is pixels advanced by line_size.
 */
static void bfin_put_pixels8_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels8uc_nornd (block, pixels, below, line_size, h);
}
50 164

  
165

  
166
/**
 * Full-pel 16-wide put, no-rounding variant: calls
 * ff_bfin_put_pixels16uc_nornd() with the same pointer for both sources.
 *
 * NOTE(review): this wrapper has external linkage while the other put
 * wrappers in this file are static -- confirm whether that is intended.
 */
void bfin_put_pixels16_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;   /* identical operand: no offset */

    ff_bfin_put_pixels16uc_nornd (block, first, second, line_size, h);
}
170

  
171
/**
 * Horizontal half-pel 16-wide put, no-rounding variant: the second source
 * given to ff_bfin_put_pixels16uc_nornd() is pixels shifted by one byte.
 */
static void bfin_put_pixels16_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels16uc_nornd (block, pixels, right, line_size, h);
}
175

  
176
/**
 * Vertical half-pel 16-wide put, no-rounding variant: the second source
 * given to ff_bfin_put_pixels16uc_nornd() is pixels advanced by line_size.
 */
static void bfin_put_pixels16_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels16uc_nornd (block, pixels, below, line_size, h);
}
180

  
181
/**
 * Full-pel 16-wide comparison: returns ff_bfin_z_sad16x16() of blk1 and
 * blk2, passing line_size for both of its stride arguments.
 * The context pointer c is not used.
 */
static int bfin_pix_abs16 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    int score;

    score = ff_bfin_z_sad16x16 (blk1, blk2, line_size, line_size, h);
    return score;
}
185

  
186
static uint8_t vtmp_blk[256] __attribute__((l1_data_B));
187

  
188
static int bfin_pix_abs16_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
189
{
190
    ff_bfin_put_pixels16uc (vtmp_blk, blk2, blk2+1, 16, line_size, h);
191
    return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
192
}
193

  
194
static int bfin_pix_abs16_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
195
{
196
    ff_bfin_put_pixels16uc (vtmp_blk, blk2, blk2+line_size, 16, line_size, h);
197
    return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
198
}
199

  
200
static int bfin_pix_abs16_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
201
{
202
    ff_bfin_z_put_pixels16_xy2 (vtmp_blk, blk2, 16, line_size, h);
203
    return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
204
}
205

  
206
/**
 * Full-pel 8-wide comparison: returns ff_bfin_z_sad8x8() of blk1 and blk2,
 * passing line_size for both of its stride arguments.
 * The context pointer c is not used.
 */
static int bfin_pix_abs8 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    int score;

    score = ff_bfin_z_sad8x8 (blk1, blk2, line_size, line_size, h);
    return score;
}
210

  
211
static int bfin_pix_abs8_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
212
{
213
    ff_bfin_put_pixels8uc (vtmp_blk, blk2, blk2+1, 8, line_size, h);
214
    return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
215
}
216

  
217
static int bfin_pix_abs8_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
218
{
219
    ff_bfin_put_pixels8uc (vtmp_blk, blk2, blk2+line_size, 8, line_size, h);
220
    return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
221
}
222

  
223
static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
224
{
225
    ff_bfin_z_put_pixels8_xy2 (vtmp_blk, blk2, 8, line_size, h);
226
    return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
227
}
228

  
229

  
230
/*
231
  decoder optimization
232
  start on 2/11 100 frames of 352x240@25 compiled with no optimization -g debugging
233
  9.824s ~ 2.44x off
234
  6.360s ~ 1.58x off with -O2
235
  5.740s ~ 1.43x off with idcts
236

  
237
  2.64s    2/20 same sman.mp4 decode only
238

  
239
*/
240

  
51 241
/**
 * Install the Blackfin routines of this file into a DSPContext.
 *
 * Every assignment is unconditional; avctx is accepted but not read.
 * The earlier stores of sad8x8_bfin into pix_abs[1][0] and sad[1] are gone:
 * both slots are assigned again below, so those stores had no effect.
 *
 * @param c     function-pointer table to fill in
 * @param avctx codec context (unused here)
 */
void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
{
    c->get_pixels         = ff_bfin_get_pixels;
    c->diff_pixels        = ff_bfin_diff_pixels;
    c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
    c->add_pixels_clamped = ff_bfin_add_pixels_clamped;

    c->clear_blocks       = bfin_clear_blocks;
    c->pix_sum            = ff_bfin_pix_sum;
    c->pix_norm1          = ff_bfin_pix_norm1;

    c->sad[0]             = bfin_pix_abs16;
    c->sad[1]             = bfin_pix_abs8;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = bfin_pix_abs16;
    c->pix_abs[0][1] = bfin_pix_abs16_x2;
    c->pix_abs[0][2] = bfin_pix_abs16_y2;
    c->pix_abs[0][3] = bfin_pix_abs16_xy2;

    c->pix_abs[1][0] = bfin_pix_abs8;
    c->pix_abs[1][1] = bfin_pix_abs8_x2;
    c->pix_abs[1][2] = bfin_pix_abs8_y2;
    c->pix_abs[1][3] = bfin_pix_abs8_xy2;

    c->sse[0] = ff_bfin_sse16;
    c->sse[1] = ff_bfin_sse8;
    c->sse[2] = ff_bfin_sse4;

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * Only two rows of the table are filled here, one per block width,
     * each with the 4 halfpel positions:
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */

    c->put_pixels_tab[0][0] = bfin_put_pixels16;
    c->put_pixels_tab[0][1] = bfin_put_pixels16_x2;
    c->put_pixels_tab[0][2] = bfin_put_pixels16_y2;
    c->put_pixels_tab[0][3] = bfin_put_pixels16_xy2;

    c->put_pixels_tab[1][0] = bfin_put_pixels8;
    c->put_pixels_tab[1][1] = bfin_put_pixels8_x2;
    c->put_pixels_tab[1][2] = bfin_put_pixels8_y2;
    c->put_pixels_tab[1][3] = bfin_put_pixels8_xy2;

    /* Same layout as above, for the variants without rounding. */
    c->put_no_rnd_pixels_tab[1][0] = bfin_put_pixels8_nornd;
    c->put_no_rnd_pixels_tab[1][1] = bfin_put_pixels8_x2_nornd;
    c->put_no_rnd_pixels_tab[1][2] = bfin_put_pixels8_y2_nornd;
    c->put_no_rnd_pixels_tab[1][3] = ff_bfin_put_pixels8_xy2_nornd;

    c->put_no_rnd_pixels_tab[0][0] = bfin_put_pixels16_nornd;
    c->put_no_rnd_pixels_tab[0][1] = bfin_put_pixels16_x2_nornd;
    c->put_no_rnd_pixels_tab[0][2] = bfin_put_pixels16_y2_nornd;
    c->put_no_rnd_pixels_tab[0][3] = ff_bfin_put_pixels16_xy2_nornd;

    c->fdct               = ff_bfin_fdct;
    c->idct               = ff_bfin_idct;
    c->idct_add           = bfin_idct_add;
    c->idct_put           = bfin_idct_put;
}
308

  
309

  
310

  

Also available in: Unified diff