Revision bb7d4939

View differences:

libavcodec/Makefile
63 63
# alpha specific stuff
64 64
ifeq ($(TARGET_ARCH_ALPHA),yes)
65 65
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o
66
ASM_OBJS += alpha/dsputil_alpha_asm.o
66 67
CFLAGS += -Wa,-mpca56
67 68
endif
68 69

  
69
SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.s)
70
SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.S)
70 71
OBJS := $(OBJS) $(ASM_OBJS)
71 72

  
72 73
LIB= libavcodec.a
libavcodec/alpha/dsputil_alpha.c
22 22

  
23 23
void simple_idct_axp(DCTELEM *block);
24 24

  
25
static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
26
				   int line_size)
25
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
26
				int line_size);
27
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
28
				int line_size);
29

  
30
#if 0
31
/* These functions were the base for the optimized assembler routines,
32
   and remain here for documentation purposes.  */
33
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 
34
                                   int line_size)
27 35
{
28 36
    int i = 8;
37
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
29 38

  
30 39
    ASM_ACCEPT_MVI;
31 40

  
32 41
    do {
33
	UINT64 shorts;
42
        uint64_t shorts0, shorts1;
34 43

  
35
	shorts = ldq(block);
36
	shorts = maxsw4(shorts, 0);
37
	shorts = minsw4(shorts, WORD_VEC(0x00ff));
38
	stl(pkwb(shorts), pixels);
44
        shorts0 = ldq(block);
45
        shorts0 = maxsw4(shorts0, 0);
46
        shorts0 = minsw4(shorts0, clampmask);
47
        stl(pkwb(shorts0), pixels);
39 48

  
40
	shorts = ldq(block + 4);
41
	shorts = maxsw4(shorts, 0);
42
	shorts = minsw4(shorts, WORD_VEC(0x00ff));
43
	stl(pkwb(shorts), pixels + 4);
49
        shorts1 = ldq(block + 4);
50
        shorts1 = maxsw4(shorts1, 0);
51
        shorts1 = minsw4(shorts1, clampmask);
52
        stl(pkwb(shorts1), pixels + 4);
44 53

  
45
	pixels += line_size;
46
	block += 8;
54
        pixels += line_size;
55
        block += 8;
47 56
    } while (--i);
48 57
}
49 58

  
50
static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
51
				   int line_size)
59
void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 
60
                            int line_size)
52 61
{
53
    int i = 8;
62
    int h = 8;
63
    /* Keep this function a leaf function by generating the constants
64
       manually (mainly for the hack value ;-).  */
65
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
66
    uint64_t signmask  = zap(-1, 0x33);
67
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
54 68

  
55 69
    ASM_ACCEPT_MVI;
56 70

  
57 71
    do {
58
	UINT64 shorts; 
59

  
60
	shorts = ldq(block);
61
	shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
62
	shorts += unpkbw(ldl(pixels));
63
	shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
64
	shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
65
	shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
66
	shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
67
	stl(pkwb(shorts), pixels);
68

  
69
	/* next 4 */
70
	shorts = ldq(block + 4);
71
	shorts &= ~WORD_VEC(0x8000);
72
	shorts += unpkbw(ldl(pixels + 4));
73
	shorts &= ~WORD_VEC(0x8000);
74
	shorts = minuw4(shorts, WORD_VEC(0x4000));
75
	shorts &= ~WORD_VEC(0x4000);
76
	shorts = minsw4(shorts, WORD_VEC(0x00ff));
77
	stl(pkwb(shorts), pixels + 4);
78

  
79
	pixels += line_size;
80
	block += 8;
81
    } while (--i);
72
        uint64_t shorts0, pix0, signs0;
73
        uint64_t shorts1, pix1, signs1;
74

  
75
        shorts0 = ldq(block);
76
        shorts1 = ldq(block + 4);
77

  
78
        pix0    = unpkbw(ldl(pixels));
79
        /* Signed subword add (MMX paddw).  */
80
        signs0  = shorts0 & signmask;
81
        shorts0 &= ~signmask;
82
        shorts0 += pix0;
83
        shorts0 ^= signs0;
84
        /* Clamp. */
85
        shorts0 = maxsw4(shorts0, 0);
86
        shorts0 = minsw4(shorts0, clampmask);   
87

  
88
        /* Next 4.  */
89
        pix1    = unpkbw(ldl(pixels + 4));
90
        signs1  = shorts1 & signmask;
91
        shorts1 &= ~signmask;
92
        shorts1 += pix1;
93
        shorts1 ^= signs1;
94
        shorts1 = maxsw4(shorts1, 0);
95
        shorts1 = minsw4(shorts1, clampmask);
96

  
97
        stl(pkwb(shorts0), pixels);
98
        stl(pkwb(shorts1), pixels + 4);
99

  
100
        pixels += line_size;
101
        block += 8;
102
    } while (--h);
82 103
}
104
#endif
83 105

  
84 106
/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
85 107
   Since the intermediate result could be greater than 255, we do the
......
222 244

  
223 245
    /* amask clears all bits that correspond to present features.  */
224 246
    if (amask(AMASK_MVI) == 0) {
225
	put_pixels_clamped = put_pixels_clamped_axp;
226
	add_pixels_clamped = add_pixels_clamped_axp;
247
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
248
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
227 249
    }
228 250
}
libavcodec/alpha/dsputil_alpha_asm.S
1
/*
2
 * Alpha optimized DSP utils
3
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
 */
19

  
20
/*
21
 * These functions are scheduled for pca56. They should work
22
 * reasonably on ev6, though.
23
 */
24

  
25
#include "regdef.h"
26

  
27
/* Some nicer register names.  */
28
#define ta t10
29
#define tb t11
30
#define tc t12
31
#define td AT
32
/* Danger: these overlap with the argument list and the return value */
33
#define te a5
34
#define tf a4
35
#define tg a3
36
#define th v0
37
                
38
        .set noat
39
        .set noreorder
40
        .arch pca56
41
        .text
42

  
43
/************************************************************************
44
 * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
45
 *                                 int line_size)
46
 */
47
        .align 6
48
        .globl put_pixels_clamped_mvi_asm
49
        .ent put_pixels_clamped_mvi_asm
50
put_pixels_clamped_mvi_asm:
51
        .frame sp, 0, ra
52
        .prologue 0
53

  
54
        lda     t8, -1
55
        lda     t9, 8           # loop counter
56
        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff
57

  
58
        .align 4
59
1:      ldq     t0,  0(a0)
60
        ldq     t1,  8(a0)
61
        ldq     t2, 16(a0)
62
        ldq     t3, 24(a0)
63

  
64
        maxsw4  t0, zero, t0
65
        subq    t9, 2, t9
66
        maxsw4  t1, zero, t1
67
        lda     a0, 32(a0)
68

  
69
        maxsw4  t2, zero, t2
70
        addq    a1, a2, ta
71
        maxsw4  t3, zero, t3
72
        minsw4  t0, t8, t0
73
        
74
        minsw4  t1, t8, t1
75
        minsw4  t2, t8, t2
76
        minsw4  t3, t8, t3
77
        pkwb    t0, t0
78
        
79
        pkwb    t1, t1
80
        pkwb    t2, t2
81
        pkwb    t3, t3
82
        stl     t0, 0(a1)
83
        
84
        stl     t1, 4(a1)
85
        addq    ta, a2, a1
86
        stl     t2, 0(ta)
87
        stl     t3, 4(ta)
88

  
89
        bne     t9, 1b
90
        ret
91
        .end put_pixels_clamped_mvi_asm
92

  
93
/************************************************************************
94
 * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
95
 *                                 int line_size)
96
 */
97
        .align 6
98
        .globl add_pixels_clamped_mvi_asm
99
        .ent add_pixels_clamped_mvi_asm
100
add_pixels_clamped_mvi_asm:
101
        .frame sp, 0, ra
102
        .prologue 0
103

  
104
        lda     t1, -1
105
        lda     th, 8
106
        zap     t1, 0x33, tg
107
        nop
108

  
109
        srl     tg, 1, t0
110
        xor     tg, t0, tg      # 0x8000800080008000
111
        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff
112

  
113
        .align 4
114
1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
115
        ldl     t4, 4(a1)       # pix1
116
        addq    a1, a2, te      # te = pixels + line_size (second row)
117
        ldq     t0, 0(a0)       # shorts0
118

  
119
        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
120
        ldl     ta, 4(te)       # pix3
121
        ldq     t3, 8(a0)       # shorts1
122
        ldq     t6, 16(a0)      # shorts2
123

  
124
        ldq     t9, 24(a0)      # shorts3
125
        unpkbw  t1, t1          # 0 0 (quarter/op no.)
126
        and     t0, tg, t2      # 0 1
127
        unpkbw  t4, t4          # 1 0
128

  
129
        bic     t0, tg, t0      # 0 2
130
        unpkbw  t7, t7          # 2 0
131
        and     t3, tg, t5      # 1 1
132
        addq    t0, t1, t0      # 0 3 
133

  
134
        xor     t0, t2, t0      # 0 4
135
        unpkbw  ta, ta          # 3 0
136
        and     t6, tg, t8      # 2 1
137
        maxsw4  t0, zero, t0    # 0 5
138
        
139
        bic     t3, tg, t3      # 1 2
140
        bic     t6, tg, t6      # 2 2
141
        minsw4  t0, tf, t0      # 0 6
142
        addq    t3, t4, t3      # 1 3
143
        
144
        pkwb    t0, t0          # 0 7
145
        xor     t3, t5, t3      # 1 4
146
        maxsw4  t3, zero, t3    # 1 5
147
        addq    t6, t7, t6      # 2 3
148

  
149
        xor     t6, t8, t6      # 2 4
150
        and     t9, tg, tb      # 3 1
151
        minsw4  t3, tf, t3      # 1 6
152
        bic     t9, tg, t9      # 3 2
153

  
154
        maxsw4  t6, zero, t6    # 2 5
155
        addq    t9, ta, t9      # 3 3
156
        stl     t0, 0(a1)       # 0 8   
157
        minsw4  t6, tf, t6      # 2 6
158

  
159
        xor     t9, tb, t9      # 3 4
160
        maxsw4  t9, zero, t9    # 3 5
161
        lda     a0, 32(a0)      # block += 16;
162
        pkwb    t3, t3          # 1 7
163
        
164
        minsw4  t9, tf, t9      # 3 6
165
        subq    th, 2, th
166
        pkwb    t6, t6          # 2 7
167
        pkwb    t9, t9          # 3 7
168

  
169
        stl     t3, 4(a1)       # 1 8
170
        addq    te, a2, a1      # pixels += 2 * line_size (te is one row ahead)
171
        stl     t6, 0(te)       # 2 8
172
        stl     t9, 4(te)       # 3 8
173

  
174
        bne     th, 1b
175
        ret     
176
        .end add_pixels_clamped_mvi_asm
libavcodec/alpha/regdef.h
1
/* Some BSDs don't seem to have regdef.h... sigh  */
2
#ifndef alpha_regdef_h
3
#define alpha_regdef_h
4

  
5
#define v0      $0      /* function return value */
6

  
7
#define t0      $1      /* temporary registers (caller-saved) */
8
#define t1      $2
9
#define t2      $3
10
#define t3      $4
11
#define t4      $5
12
#define t5      $6
13
#define t6      $7
14
#define t7      $8
15

  
16
#define s0      $9      /* saved-registers (callee-saved registers) */
17
#define s1      $10
18
#define s2      $11
19
#define s3      $12
20
#define s4      $13
21
#define s5      $14
22
#define s6      $15
23
#define fp      s6      /* frame-pointer (s6 in frame-less procedures) */
24

  
25
#define a0      $16     /* argument registers (caller-saved) */
26
#define a1      $17
27
#define a2      $18
28
#define a3      $19
29
#define a4      $20
30
#define a5      $21
31

  
32
#define t8      $22     /* more temps (caller-saved) */
33
#define t9      $23
34
#define t10     $24
35
#define t11     $25
36
#define ra      $26     /* return address register */
37
#define t12     $27
38

  
39
#define pv      t12     /* procedure-variable register */
40
#define AT      $at     /* assembler temporary */
41
#define gp      $29     /* global pointer */
42
#define sp      $30     /* stack pointer */
43
#define zero    $31     /* reads as zero, writes are noops */
44

  
45
#endif /* alpha_regdef_h */

Also available in: Unified diff