Revision bb7d4939 libavcodec/alpha/dsputil_alpha.c

View differences:

libavcodec/alpha/dsputil_alpha.c
22 22

  
23 23
void simple_idct_axp(DCTELEM *block);
24 24

  
25
static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
26
				   int line_size)
25
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
26
				int line_size);
27
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
28
				int line_size);
29

  
30
#if 0
31
/* These functions were the base for the optimized assembler routines,
32
   and remain here for documentation purposes.  */
33
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 
34
                                   int line_size)
27 35
{
28 36
    int i = 8;
37
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
29 38

  
30 39
    ASM_ACCEPT_MVI;
31 40

  
32 41
    do {
33
	UINT64 shorts;
42
        uint64_t shorts0, shorts1;
34 43

  
35
	shorts = ldq(block);
36
	shorts = maxsw4(shorts, 0);
37
	shorts = minsw4(shorts, WORD_VEC(0x00ff));
38
	stl(pkwb(shorts), pixels);
44
        shorts0 = ldq(block);
45
        shorts0 = maxsw4(shorts0, 0);
46
        shorts0 = minsw4(shorts0, clampmask);
47
        stl(pkwb(shorts0), pixels);
39 48

  
40
	shorts = ldq(block + 4);
41
	shorts = maxsw4(shorts, 0);
42
	shorts = minsw4(shorts, WORD_VEC(0x00ff));
43
	stl(pkwb(shorts), pixels + 4);
49
        shorts1 = ldq(block + 4);
50
        shorts1 = maxsw4(shorts1, 0);
51
        shorts1 = minsw4(shorts1, clampmask);
52
        stl(pkwb(shorts1), pixels + 4);
44 53

  
45
	pixels += line_size;
46
	block += 8;
54
        pixels += line_size;
55
        block += 8;
47 56
    } while (--i);
48 57
}
49 58

  
50
static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
51
				   int line_size)
59
void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 
60
                            int line_size)
52 61
{
53
    int i = 8;
62
    int h = 8;
63
    /* Keep this function a leaf function by generating the constants
64
       manually (mainly for the hack value ;-).  */
65
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
66
    uint64_t signmask  = zap(-1, 0x33);
67
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
54 68

  
55 69
    ASM_ACCEPT_MVI;
56 70

  
57 71
    do {
58
	UINT64 shorts; 
59

  
60
	shorts = ldq(block);
61
	shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
62
	shorts += unpkbw(ldl(pixels));
63
	shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
64
	shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
65
	shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
66
	shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
67
	stl(pkwb(shorts), pixels);
68

  
69
	/* next 4 */
70
	shorts = ldq(block + 4);
71
	shorts &= ~WORD_VEC(0x8000);
72
	shorts += unpkbw(ldl(pixels + 4));
73
	shorts &= ~WORD_VEC(0x8000);
74
	shorts = minuw4(shorts, WORD_VEC(0x4000));
75
	shorts &= ~WORD_VEC(0x4000);
76
	shorts = minsw4(shorts, WORD_VEC(0x00ff));
77
	stl(pkwb(shorts), pixels + 4);
78

  
79
	pixels += line_size;
80
	block += 8;
81
    } while (--i);
72
        uint64_t shorts0, pix0, signs0;
73
        uint64_t shorts1, pix1, signs1;
74

  
75
        shorts0 = ldq(block);
76
        shorts1 = ldq(block + 4);
77

  
78
        pix0    = unpkbw(ldl(pixels));
79
        /* Signed subword add (MMX paddw).  */
80
        signs0  = shorts0 & signmask;
81
        shorts0 &= ~signmask;
82
        shorts0 += pix0;
83
        shorts0 ^= signs0;
84
        /* Clamp. */
85
        shorts0 = maxsw4(shorts0, 0);
86
        shorts0 = minsw4(shorts0, clampmask);   
87

  
88
        /* Next 4.  */
89
        pix1    = unpkbw(ldl(pixels + 4));
90
        signs1  = shorts1 & signmask;
91
        shorts1 &= ~signmask;
92
        shorts1 += pix1;
93
        shorts1 ^= signs1;
94
        shorts1 = maxsw4(shorts1, 0);
95
        shorts1 = minsw4(shorts1, clampmask);
96

  
97
        stl(pkwb(shorts0), pixels);
98
        stl(pkwb(shorts1), pixels + 4);
99

  
100
        pixels += line_size;
101
        block += 8;
102
    } while (--h);
82 103
}
104
#endif
83 105

  
84 106
/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
85 107
   Since the intermediate result could be greater than 255, we do the
......
222 244

  
223 245
    /* amask clears all bits that correspond to present features.  */
224 246
    if (amask(AMASK_MVI) == 0) {
225
	put_pixels_clamped = put_pixels_clamped_axp;
226
	add_pixels_clamped = add_pixels_clamped_axp;
247
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
248
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
227 249
    }
228 250
}

Also available in: Unified diff