Statistics
| Branch: | Revision:

ffmpeg / libavcodec / alpha / dsputil_alpha_asm.S @ 186447f8

History | View | Annotate | Download (7.15 KB)

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * These functions are scheduled for pca56. They should work
 * reasonably on ev6, though.
 */

#include "regdef.h"
#ifdef HAVE_AV_CONFIG_H
#include "config.h"
#endif

/*
 * Some nicer register names.
 *
 * ta..td extend the t0-t9 temporaries with further scratch registers.
 * td is the assembler temporary AT, which is why ".set noat" is in
 * effect below.
 */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/*
 * Danger: these overlap with the argument list and the return value.
 * te/tf/tg alias the argument registers a5/a4/a3 and th aliases the
 * return-value register v0, so each may only be used as scratch in a
 * function where the aliased argument/return register is not live.
 */
#define te a5
#define tf a4
#define tg a3
#define th v0

/*
 * Assembler setup for everything that follows:
 *   .set noat       this file uses AT itself (td, and the _mcount call
 *                   sequence), so the assembler must not use it
 *   .set noreorder  keep the hand-written instruction schedule as is
 *   .arch pca56     target CPU; the MVI instructions used in this file
 *                   (maxsw4, minsw4, pkwb, unpkbw) need an MVI-capable
 *                   target
 */
        .set noat
        .set noreorder
        .arch pca56
        .text

/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 *
 * Copies h rows of 8 bytes each from pixels to block.  Both pointers
 * advance by line_size after every row.
 *
 * In:    a0 = block      (destination; written with stq, so it is
 *                         expected to be 8-byte aligned)
 *        a1 = pixels     (source; any alignment)
 *        a2 = line_size  (row stride in bytes for both pointers)
 *        a3 = h          (row count)
 * Out:   nothing
 * Clobb: t0-t9, ta, a0, a1, a3 (and AT when HAVE_GPROF is defined)
 *
 * Preconditions visible from the code:
 *   - Four rows are handled per iteration and the loop ends only when
 *     the counter reaches exactly zero (subq 4 / bne), so h must be a
 *     positive multiple of 4.
 *   - The alignment of pixels is tested once, before the loop, and in
 *     the unaligned loop the extract instructions take their byte
 *     offset from a1 after it has already been advanced by line_size.
 *     Both are only valid if line_size is a multiple of 8, so that
 *     (a1 & 7) stays the same for every row.
 *
 * The aligned case needs its own loop: the unaligned sequence merges
 * the quadwords at 0(a1) and 8(a1) with extql/extqh, and extqh with a
 * byte offset of zero does not shift its source out, so the merge
 * would be wrong for an aligned source.
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0

        /* Profiling hook: call _mcount, return address goes to AT. */
#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        and     a1, 7, t0               # t0 = pixels & 7
        beq     t0, $aligned            # 8-byte aligned source: simple loop

        /*
         * Unaligned source.  For each of four rows load the two aligned
         * quadwords that straddle the row (t0/t1, t2/t3, t4/t5, t6/t7),
         * then shift them into place and OR them together.
         */
        .align 4
$unaligned:
        ldq_u   t0, 0(a1)               # row 0, low part
        ldq_u   t1, 8(a1)               # row 0, high part
        addq    a1, a2, a1
        nop

        ldq_u   t2, 0(a1)               # row 1
        ldq_u   t3, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t4, 0(a1)               # row 2
        ldq_u   t5, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t6, 0(a1)               # row 3
        ldq_u   t7, 8(a1)
        extql   t0, a1, t0              # only the low 3 bits of a1 matter
        addq    a1, a2, a1              # a1 = source of the next group

        extqh   t1, a1, t1
        addq    a0, a2, t8              # t8 = destination row 1
        extql   t2, a1, t2
        addq    t8, a2, t9              # t9 = destination row 2

        extqh   t3, a1, t3
        addq    t9, a2, ta              # ta = destination row 3
        extql   t4, a1, t4
        or      t0, t1, t0              # row 0 assembled

        extqh   t5, a1, t5
        or      t2, t3, t2              # row 1 assembled
        extql   t6, a1, t6
        or      t4, t5, t4              # row 2 assembled

        extqh   t7, a1, t7
        or      t6, t7, t6              # row 3 assembled
        stq     t0, 0(a0)
        stq     t2, 0(t8)

        stq     t4, 0(t9)
        subq    a3, 4, a3               # h -= 4
        stq     t6, 0(ta)
        addq    ta, a2, a0              # a0 = destination of the next group

        bne     a3, $unaligned
        ret

        /*
         * Aligned source: plain quadword loads, four rows per iteration.
         * t4/t5/t6 hold the destination addresses of rows 1..3.
         */
        .align 4
$aligned:
        ldq     t0, 0(a1)               # row 0
        addq    a1, a2, a1
        ldq     t1, 0(a1)               # row 1
        addq    a1, a2, a1

        ldq     t2, 0(a1)               # row 2
        addq    a1, a2, a1
        ldq     t3, 0(a1)               # row 3

        addq    a0, a2, t4              # t4 = destination row 1
        addq    a1, a2, a1              # a1 = source of the next group
        addq    t4, a2, t5              # t5 = destination row 2
        subq    a3, 4, a3               # h -= 4

        stq     t0, 0(a0)
        addq    t5, a2, t6              # t6 = destination row 3
        stq     t1, 0(t4)
        addq    t6, a2, a0              # a0 = destination of the next group

        stq     t2, 0(t5)
        stq     t3, 0(t6)

        bne     a3, $aligned
        ret
        .end put_pixels_axp_asm

/************************************************************************
 * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Converts an 8x8 block of signed 16-bit values to bytes, clamping each
 * value to the range 0..255, and stores the result as 8 rows of 8 bytes
 * starting at pixels with a row stride of line_size.
 *
 * In:    a0 = block      (read with ldq, 64 values of 16 bits,
 *                         contiguous)
 *        a1 = pixels     (written with stl, 4 bytes at a time)
 *        a2 = line_size  (destination row stride in bytes)
 * Out:   nothing
 * Clobb: t0-t3, t8, t9, ta, a0, a1 (and AT when HAVE_GPROF is defined)
 *
 * Each iteration handles two rows: four quadwords (16 values) are
 * loaded, clamped with maxsw4 against zero and minsw4 against 0x00ff
 * in every 16-bit lane, packed to bytes with pkwb and stored as four
 * longwords.  The counter starts at 8 and drops by 2 per iteration,
 * giving four iterations.
 */
        .align 6
        .globl put_pixels_clamped_mvi_asm
        .ent put_pixels_clamped_mvi_asm
put_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

        /* Profiling hook: call _mcount, return address goes to AT. */
#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        lda     t8, -1
        lda     t9, 8           # loop counter
        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff

        .align 4
1:      ldq     t0,  0(a0)      # row n,   values 0..3
        ldq     t1,  8(a0)      # row n,   values 4..7
        ldq     t2, 16(a0)      # row n+1, values 0..3
        ldq     t3, 24(a0)      # row n+1, values 4..7

        maxsw4  t0, zero, t0    # lower clamp: max(value, 0)
        subq    t9, 2, t9       # two rows done per iteration
        maxsw4  t1, zero, t1
        lda     a0, 32(a0)      # block += 16 values

        maxsw4  t2, zero, t2
        addq    a1, a2, ta      # ta = address of row n+1
        maxsw4  t3, zero, t3
        minsw4  t0, t8, t0      # upper clamp: min(value, 255)

        minsw4  t1, t8, t1
        minsw4  t2, t8, t2
        minsw4  t3, t8, t3
        pkwb    t0, t0          # pack four 16-bit lanes into four bytes

        pkwb    t1, t1
        pkwb    t2, t2
        pkwb    t3, t3
        stl     t0, 0(a1)       # row n, left half

        stl     t1, 4(a1)       # row n, right half
        addq    ta, a2, a1      # pixels += 2 * line_size
        stl     t2, 0(ta)       # row n+1, left half
        stl     t3, 4(ta)       # row n+1, right half

        bne     t9, 1b
        ret
        .end put_pixels_clamped_mvi_asm

/************************************************************************
 * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Adds an 8x8 block of signed 16-bit values to the 8x8 bytes already
 * stored at pixels (row stride line_size), clamps each sum to 0..255
 * and writes the bytes back in place.
 *
 * In:    a0 = block      (read with ldq, 64 values of 16 bits,
 *                         contiguous)
 *        a1 = pixels     (read with ldl and written with stl)
 *        a2 = line_size  (row stride in bytes)
 * Out:   nothing
 * Clobb: t0-t9, ta, tb, a0, a1, and the aliases te (a5), tf (a4),
 *        tg (a3), th (v0); AT when HAVE_GPROF is defined.  Using
 *        a3-a5 and v0 as scratch relies on this function taking only
 *        three arguments and returning nothing.
 *
 * Constants kept in registers for the whole loop:
 *   tg = 0x8000800080008000  sign bit of every 16-bit lane
 *   tf = 0x00ff00ff00ff00ff  upper clamp (255) for every lane
 *   th = row counter, starts at 8 and drops by 2 per iteration
 *
 * Lane-wise add without a packed-add instruction: the sign bit of each
 * value is saved (and), cleared (bic), the unpacked pixel (0..255) is
 * added with a plain addq, and the saved sign bit is put back with xor.
 * With bit 15 cleared, adding at most 255 cannot carry into the next
 * lane, and the xor completes the sum modulo 2^16 in each lane.
 *
 * Each iteration handles two rows as four independent "quarters" of
 * four pixels.  The trailing "q n" comments give the quarter (0..3)
 * and the step within that quarter:
 *   0 unpack pixels   1 save sign bits   2 clear sign bits   3 add
 *   4 restore sign    5 clamp at 0       6 clamp at 255      7 pack
 *   8 store
 */
        .align 6
        .globl add_pixels_clamped_mvi_asm
        .ent add_pixels_clamped_mvi_asm
add_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

        /* Profiling hook: call _mcount, return address goes to AT. */
#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        lda     t1, -1
        lda     th, 8           # loop counter
        zap     t1, 0x33, tg    # 0xffff0000ffff0000
        nop

        srl     tg, 1, t0       # 0x7fff80007fff8000
        xor     tg, t0, tg      # 0x8000800080008000
        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff

        .align 4
1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
        ldl     t4, 4(a1)       # pix1
        addq    a1, a2, te      # pixels += line_size
        ldq     t0, 0(a0)       # shorts0

        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
        ldl     ta, 4(te)       # pix3
        ldq     t3, 8(a0)       # shorts1
        ldq     t6, 16(a0)      # shorts2

        ldq     t9, 24(a0)      # shorts3
        unpkbw  t1, t1          # 0 0 (quarter/op no.)
        and     t0, tg, t2      # 0 1
        unpkbw  t4, t4          # 1 0

        bic     t0, tg, t0      # 0 2
        unpkbw  t7, t7          # 2 0
        and     t3, tg, t5      # 1 1
        addq    t0, t1, t0      # 0 3

        xor     t0, t2, t0      # 0 4
        unpkbw  ta, ta          # 3 0
        and     t6, tg, t8      # 2 1
        maxsw4  t0, zero, t0    # 0 5

        bic     t3, tg, t3      # 1 2
        bic     t6, tg, t6      # 2 2
        minsw4  t0, tf, t0      # 0 6
        addq    t3, t4, t3      # 1 3

        pkwb    t0, t0          # 0 7
        xor     t3, t5, t3      # 1 4
        maxsw4  t3, zero, t3    # 1 5
        addq    t6, t7, t6      # 2 3

        xor     t6, t8, t6      # 2 4
        and     t9, tg, tb      # 3 1
        minsw4  t3, tf, t3      # 1 6
        bic     t9, tg, t9      # 3 2

        maxsw4  t6, zero, t6    # 2 5
        addq    t9, ta, t9      # 3 3
        stl     t0, 0(a1)       # 0 8
        minsw4  t6, tf, t6      # 2 6

        xor     t9, tb, t9      # 3 4
        maxsw4  t9, zero, t9    # 3 5
        lda     a0, 32(a0)      # block += 16;
        pkwb    t3, t3          # 1 7

        minsw4  t9, tf, t9      # 3 6
        subq    th, 2, th       # two rows done per iteration
        pkwb    t6, t6          # 2 7
        pkwb    t9, t9          # 3 7

        stl     t3, 4(a1)       # 1 8
        addq    te, a2, a1      # pixels += line_size
        stl     t6, 0(te)       # 2 8
        stl     t9, 4(te)       # 3 8

        bne     th, 1b
        ret
        .end add_pixels_clamped_mvi_asm