Statistics
| Branch: | Revision:

ffmpeg / libavcodec / alpha / dsputil_alpha_asm.S @ ab35de18

History | View | Annotate | Download (7.09 KB)

1
/*
2
 * Alpha optimized DSP utils
3
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
 */
19

    
20
/*
21
 * These functions are scheduled for pca56. They should work
22
 * reasonably on ev6, though.
23
 */
24

    
25
#include "regdef.h"
26

    
27
/*
 * Some nicer register names.
 *
 * ta..td extend the t0-t9 scratch set with further temporaries so
 * that the routines below can keep more values live at once.
 */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/*
 * Danger: these overlap with the argument list and the return value.
 * te/tf/tg alias the argument registers a5/a4/a3 and th aliases v0,
 * so they may only be used where those argument slots are not needed
 * and the function returns void.
 */
#define te a5
#define tf a4
#define tg a3
#define th v0

        .set noat                       # AT is used explicitly (td, gprof hook)
        .set noreorder                  # keep the hand-scheduled instruction order
        .arch pca56                     # target named in the scheduling note; MVI ops are used below
        .text
42

    
43
/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 *
 * Copies h rows of 8 bytes from pixels to block, advancing both
 * pointers by line_size after every row.
 *
 * In:   a0 = block, a1 = pixels, a2 = line_size, a3 = h
 * Uses: t0-t9 and ta as scratch; a0, a1 and a3 are modified.
 *
 * Preconditions implied by the code:
 *  - h is a positive multiple of 4: both loops handle four rows per
 *    pass and leave only when the counter reaches exactly zero.
 *  - line_size is a multiple of 8: the alignment of pixels is tested
 *    once on entry only, and the unaligned loop takes its byte offset
 *    from the already advanced source pointer.
 *  - block is 8-byte aligned: every row is written with stq.
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0

#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        and     a1, 7, t0               # t0 = pixels & 7
        beq     t0, $aligned            # source starts on a quadword boundary

        /*
         * Unaligned source: each row is fetched as the two quadwords that
         * contain it (ldq_u), then extql/extqh shift the halves into
         * place and "or" merges them. Four rows per pass.
         */
        .align 4
$unaligned:
        ldq_u   t0, 0(a1)               # row 0, low quadword
        ldq_u   t1, 8(a1)               # row 0, high quadword
        addq    a1, a2, a1              # pixels += line_size
        nop

        ldq_u   t2, 0(a1)               # row 1, low quadword
        ldq_u   t3, 8(a1)               # row 1, high quadword
        addq    a1, a2, a1              # pixels += line_size
        nop

        ldq_u   t4, 0(a1)               # row 2, low quadword
        ldq_u   t5, 8(a1)               # row 2, high quadword
        addq    a1, a2, a1              # pixels += line_size
        nop

        ldq_u   t6, 0(a1)               # row 3, low quadword
        ldq_u   t7, 8(a1)               # row 3, high quadword
        extql   t0, a1, t0              # byte offset comes from a1 low bits
        addq    a1, a2, a1              # pixels += line_size

        extqh   t1, a1, t1
        addq    a0, a2, t8              # t8 = destination of row 1
        extql   t2, a1, t2
        addq    t8, a2, t9              # t9 = destination of row 2

        extqh   t3, a1, t3
        addq    t9, a2, ta              # ta = destination of row 3
        extql   t4, a1, t4
        or      t0, t1, t0              # row 0 assembled

        extqh   t5, a1, t5
        or      t2, t3, t2              # row 1 assembled
        extql   t6, a1, t6
        or      t4, t5, t4              # row 2 assembled

        extqh   t7, a1, t7
        or      t6, t7, t6              # row 3 assembled
        stq     t0, 0(a0)
        stq     t2, 0(t8)

        stq     t4, 0(t9)
        subq    a3, 4, a3               # h -= 4
        stq     t6, 0(ta)
        addq    ta, a2, a0              # block now points at the next row group

        bne     a3, $unaligned
        ret

        /*
         * Aligned source: plain quadword loads and stores, four rows per
         * pass. Rows 1-3 are written through t4-t6.
         */
        .align 4
$aligned:
        ldq     t0, 0(a1)               # row 0
        addq    a1, a2, a1              # pixels += line_size
        ldq     t1, 0(a1)               # row 1
        addq    a1, a2, a1              # pixels += line_size

        ldq     t2, 0(a1)               # row 2
        addq    a1, a2, a1              # pixels += line_size
        ldq     t3, 0(a1)               # row 3

        addq    a0, a2, t4              # t4 = destination of row 1
        addq    a1, a2, a1              # pixels += line_size
        addq    t4, a2, t5              # t5 = destination of row 2
        subq    a3, 4, a3               # h -= 4

        stq     t0, 0(a0)
        addq    t5, a2, t6              # t6 = destination of row 3
        stq     t1, 0(t4)
        addq    t6, a2, a0              # block now points at the next row group

        stq     t2, 0(t5)
        stq     t3, 0(t6)

        bne     a3, $aligned
        ret
        .end put_pixels_axp_asm
139

    
140
/************************************************************************
 * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Converts 8 rows of 8 signed 16-bit values from block into bytes,
 * clamping every value to 0..255, and stores each row at pixels,
 * advancing pixels by line_size per row. block is read contiguously,
 * 16 bytes per row.
 *
 * In:   a0 = block, a1 = pixels, a2 = line_size
 * Uses: t0-t3, t8, t9 and ta as scratch; a0 and a1 are modified.
 *
 * The row count is fixed at 8; two rows are handled per pass. The
 * code reads block with ldq and writes pixels with stl, so block is
 * expected to be 8-byte aligned and every destination row 4-byte
 * aligned.
 */
        .align 6
        .globl put_pixels_clamped_mvi_asm
        .ent put_pixels_clamped_mvi_asm
put_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        lda     t8, -1                  # all ones
        lda     t9, 8                   # loop counter (rows left)
        zap     t8, 0xaa, t8            # 00ff00ff00ff00ff: upper clamp, 255 per word

        .align 4
1:      ldq     t0,  0(a0)              # row 0, values 0-3
        ldq     t1,  8(a0)              # row 0, values 4-7
        ldq     t2, 16(a0)              # row 1, values 0-3
        ldq     t3, 24(a0)              # row 1, values 4-7

        maxsw4  t0, zero, t0            # clamp below at 0
        subq    t9, 2, t9               # two rows consumed
        maxsw4  t1, zero, t1
        lda     a0, 32(a0)              # block advances by 32 bytes

        maxsw4  t2, zero, t2
        addq    a1, a2, ta              # ta = destination of row 1
        maxsw4  t3, zero, t3
        minsw4  t0, t8, t0              # clamp above at 255

        minsw4  t1, t8, t1
        minsw4  t2, t8, t2
        minsw4  t3, t8, t3
        pkwb    t0, t0                  # pack four words into four bytes

        pkwb    t1, t1
        pkwb    t2, t2
        pkwb    t3, t3
        stl     t0, 0(a1)               # row 0, bytes 0-3

        stl     t1, 4(a1)               # row 0, bytes 4-7
        addq    ta, a2, a1              # pixels now points at the next row pair
        stl     t2, 0(ta)               # row 1, bytes 0-3
        stl     t3, 4(ta)               # row 1, bytes 4-7

        bne     t9, 1b
        ret
        .end put_pixels_clamped_mvi_asm
194

    
195
/************************************************************************
 * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Adds 8 rows of 8 signed 16-bit values from block to the bytes already
 * stored at pixels, clamps every sum to 0..255 and writes the result
 * back. pixels advances by line_size per row; block is read
 * contiguously, 16 bytes per row.
 *
 * In:   a0 = block, a1 = pixels, a2 = line_size
 * Uses: t0-t9, ta, tb and te, tf, tg, th (a5, a4, a3, v0) as scratch;
 *       a0 and a1 are modified.
 *
 * The row count is fixed at 8; two rows are handled per pass. The
 * code reads block with ldq and accesses pixels with ldl/stl, so block
 * is expected to be 8-byte aligned and every pixel row 4-byte aligned.
 *
 * Four 16-bit lanes are added in one 64-bit addq. To keep a carry from
 * crossing into the next lane, the sign bit of every coefficient is
 * masked off before the add and xor-ed back in afterwards.
 *
 * Trailing comments of the form "q n" give the quarter q (0-3, one
 * group of four pixels) and the step n applied to it:
 *   0 unpack pixel bytes to words    5 clamp below at 0
 *   1 save coefficient sign bits     6 clamp above at 255
 *   2 clear coefficient sign bits    7 pack words to bytes
 *   3 add pixels                     8 store
 *   4 restore sign bits
 */
        .align 6
        .globl add_pixels_clamped_mvi_asm
        .ent add_pixels_clamped_mvi_asm
add_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        lda     t1, -1                  # all ones
        lda     th, 8                   # loop counter (rows left)
        zap     t1, 0x33, tg            # 0xffff0000ffff0000
        nop

        srl     tg, 1, t0               # 0x7fff80007fff8000
        xor     tg, t0, tg              # 0x8000800080008000
        zap     t1, 0xaa, tf            # 0x00ff00ff00ff00ff

        .align 4
1:      ldl     t1, 0(a1)               # pix0 (try to hit cache line soon)
        ldl     t4, 4(a1)               # pix1
        addq    a1, a2, te              # pixels += line_size
        ldq     t0, 0(a0)               # shorts0

        ldl     t7, 0(te)               # pix2 (try to hit cache line soon)
        ldl     ta, 4(te)               # pix3
        ldq     t3, 8(a0)               # shorts1
        ldq     t6, 16(a0)              # shorts2

        ldq     t9, 24(a0)              # shorts3
        unpkbw  t1, t1                  # 0 0 (quarter/op no.)
        and     t0, tg, t2              # 0 1
        unpkbw  t4, t4                  # 1 0

        bic     t0, tg, t0              # 0 2
        unpkbw  t7, t7                  # 2 0
        and     t3, tg, t5              # 1 1
        addq    t0, t1, t0              # 0 3

        xor     t0, t2, t0              # 0 4
        unpkbw  ta, ta                  # 3 0
        and     t6, tg, t8              # 2 1
        maxsw4  t0, zero, t0            # 0 5

        bic     t3, tg, t3              # 1 2
        bic     t6, tg, t6              # 2 2
        minsw4  t0, tf, t0              # 0 6
        addq    t3, t4, t3              # 1 3

        pkwb    t0, t0                  # 0 7
        xor     t3, t5, t3              # 1 4
        maxsw4  t3, zero, t3            # 1 5
        addq    t6, t7, t6              # 2 3

        xor     t6, t8, t6              # 2 4
        and     t9, tg, tb              # 3 1
        minsw4  t3, tf, t3              # 1 6
        bic     t9, tg, t9              # 3 2

        maxsw4  t6, zero, t6            # 2 5
        addq    t9, ta, t9              # 3 3
        stl     t0, 0(a1)               # 0 8
        minsw4  t6, tf, t6              # 2 6

        xor     t9, tb, t9              # 3 4
        maxsw4  t9, zero, t9            # 3 5
        lda     a0, 32(a0)              # block += 16;
        pkwb    t3, t3                  # 1 7

        minsw4  t9, tf, t9              # 3 6
        subq    th, 2, th               # two rows consumed
        pkwb    t6, t6                  # 2 7
        pkwb    t9, t9                  # 3 7

        stl     t3, 4(a1)               # 1 8
        addq    te, a2, a1              # pixels += line_size
        stl     t6, 0(te)               # 2 8
        stl     t9, 4(te)               # 3 8

        bne     th, 1b
        ret
        .end add_pixels_clamped_mvi_asm