Statistics
| Branch: | Revision:

ffmpeg / libavcodec / bfin / pixels_bfin.S @ 2912e87a

History | View | Annotate | Download (23.7 KB)

1
/*
2
 * Blackfin Pixel Operations
3
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
4
 *
5
 * This file is part of Libav.
6
 *
7
 * Libav is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * Libav is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with Libav; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21
#include "config_bfin.h"
22

    
23
/*
 * put_pixels_clamped
 *
 * Clamps packed 16-bit coefficients to the range 0..255 and stores them
 * as bytes: 8 rows, 8 output bytes per row.
 *
 *   R0 = block      source coefficients, read sequentially through I0
 *   R1 = dest       output bytes, written through I1
 *   R2 = line_size  byte distance between two destination rows
 *
 * R7:4 are pushed on entry and restored before RTS.
 * The loads run one group ahead of the arithmetic, so the last loop
 * iteration reads two words past the 32 words that are actually used.
 */
DEFUN(put_pixels_clamped,mL1,
        (DCTELEM *block, uint8_t *dest, int line_size)):
    [--SP] = (R7:4);
    R4 = 0;          // lower bound (0) in both 16-bit halves
    R5.l = 0x00ff;   // upper bound (255) in the low half ...
    R5.h = 0x00ff;   // ... and in the high half
    I0 = R0;         // block
    I1 = R1;         // dest
    R2 += -4;        // line_size - 4
    M1 = R2;         // 4 (first store) + M1 (second store) = one row
    P0 = 8;          // number of rows
    R0 = [I0++];     // preload coefficients 0,1
    R1 = [I0++];     // preload coefficients 2,3
    R2 = MAX(R0, R4) (V);
    LSETUP (ppc$0,ppc$1) LC0=P0;
    // first four bytes of the row
ppc$0: R2 = MIN(R2, R5) (V);
       R3 = MAX(R1, R4) (V);
       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
       R2 = MAX(R0, R4) (V)      || [I1++] = R6;
    // second four bytes of the row; the store steps I1 to the next row
       R2 = MIN(R2, R5) (V);
       R3 = MAX(R1, R4) (V);
       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
ppc$1: R2 = MAX(R0, R4) (V)      || [I1++M1] = R6;

    (R7:4) = [SP++];
    RTS;
DEFUN_END(put_pixels_clamped)
52

    
53
/*
 * add_pixels_clamped
 *
 * Combines the 16-bit values read from block with the bytes already at
 * dest using BYTEOP3P (add with clip to an unsigned byte) and writes the
 * result back to dest. The loop runs 8 times and stores two words
 * (8 bytes) per iteration.
 *
 *   R0 = block      read through I3
 *   R1 = dest       read through I1, written through I2
 *   R2 = line_size  byte distance between two destination rows
 *
 * R7:4 are pushed on entry and restored before RTS.
 * The instruction order inside the loop is a software pipeline; do not
 * reorder it without re-checking every register lifetime.
 */
DEFUN(add_pixels_clamped,mL1,
        (DCTELEM *block, uint8_t *dest, int line_size)):
    [-- SP] = (R7:4);
    R4 = 0;
    I0 = 0;          // NOTE(review): presumably forces a zero alignment
                     // selector for the first BYTEOP3P operand - confirm
    R2 += -4;        // line_size - 4
    M0 = R2;         // row step for the second access of each row
    I1 = R1;         // dest (read pointer)
    I3 = R0;         // block
    I2 = R1;         // dest (write pointer)
    P0 = 8;          // number of rows
    M3 = 2;          // used below to step I3 by one 16-bit value
    // prologue: gather the operands for the first BYTEOP3P pair
    R0 = [I3++]  || R2 = [I1];
    R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
    R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
    R6 = BYTEOP3P(R1:0, R3:2) (LO)    || R1.H = W[I3++]  || R2 = [I1];

    LSETUP(apc$2,apc$3) LC1 = P0;
    // first word of the row
apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R3 = [I1++M0];
       R2 = R2 << 8                      || R0.H = W[I3--];
       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
       R6 = R6 + R7 (S)                  || R1.H = W[I3];
       R6 = BYTEOP3P(R1:0, R3:2) (LO)    || I3+=M3          || [I2++]=R6;
    // second word of the row; its store steps I2 to the next row
       R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R2 = [I1];
       R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
       R6 = R6 + R7 (S)                  || R1.H = W[I3++];
apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO)    || [I2++M0] = R6   || R2 = [I1];

    (R7:4) = [SP++];
    RTS;
DEFUN_END(add_pixels_clamped)
85

    
86

    
87
/*
88
  motion compensation
89
  primitives
90

    
91
     * Halfpel motion compensation with rounding (a+b+1)>>1.
92
     * This is an array[4][4] of motion compensation functions for 4
93
     * horizontal blocksizes (8,16) and the 4 halfpel positions
94
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
95
     * @param block destination where the result is stored
96
     * @param pixels source
97
     * @param line_size number of bytes in a horizontal line of block
98
     * @param h height
99

    
100
*/
101

    
102
/*
 * put_pixels8uc
 *
 * Writes the byte-wise average (BYTEOP1P, default rounding) of two
 * sources to block, 8 bytes per row for h rows. The loads are issued
 * with DISALGNEXCPT or alongside BYTEOP1P, so s0 and s1 may be unaligned.
 *
 *   R0      = block      destination, written through I3
 *   R1      = s0         first source, read through I0
 *   R2      = s1         second source, read through I1
 *   [sp+12] = dest_size  byte distance between destination rows
 *   [sp+16] = line_size  byte distance between source rows
 *   [sp+20] = h          number of rows
 *
 * The stack arguments are fetched before R7:6 are pushed, so the
 * offsets are relative to the entry SP.
 * The loads run one word ahead: the last iteration reads the first
 * word of the row following the last one.
 */
DEFUN(put_pixels8uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[sp+12];   // dest_size
        r2=[sp+16];   // line_size
        p0=[sp+20];   // h
        [--sp] = (r7:6);
        r0+=-4;
        m3=r0;        // dest_size - 4: 4 + m3 = one destination row
        r2+=-8;
        m0=r2;        // line_size - 8: 4 + m0 + 4 = one source row
        LSETUP(pp8$0,pp8$1) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];

pp8$0:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0]|| R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]  || [I3++] = R6 ;
pp8$1:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
DEFUN_END(put_pixels8uc)
127

    
128
/*
 * put_pixels16uc
 *
 * 16-byte-wide variant of put_pixels8uc: writes the byte-wise average
 * (BYTEOP1P, default rounding) of s0 and s1 to block, 16 bytes per row
 * for h rows.
 *
 *   R0      = block      destination, written through I3
 *   R1      = s0         first source, read through I0
 *   R2      = s1         second source, read through I1
 *   [fp+20] = dest_size  byte distance between destination rows
 *   [fp+24] = line_size  byte distance between source rows
 *   [fp+28] = h          number of rows
 *
 * The stack arguments are addressed through FP after "link 0".
 * The loads run one word ahead: the last iteration reads the first
 * word of the row following the last one.
 */
DEFUN(put_pixels16uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[fp+20];   // dest_size
        r2=[fp+24];   // line_size
        p0=[fp+28];   // h

        r0+=-12;
        m3=r0;        // dest_size - 12: 3*4 + m3 = one destination row
        r2+=-16;
        m0=r2;        // line_size - 16: 4*4 + m0 = one source row

        LSETUP(pp16$0,pp16$1) LC0=P0;
         DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp16$0:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++]   || R2  =[I1++];
         R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++]   || R3  =[I1++];
         [I3++] = R6;
         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
         R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R7 ;
         [I3++] = R6;
pp16$1:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];
        unlink;
        RTS;
DEFUN_END(put_pixels16uc)
162

    
163

    
164

    
165

    
166

    
167

    
168
/*
 * put_pixels8uc_nornd
 *
 * Same data flow as put_pixels8uc, but BYTEOP1P is used with the (T)
 * option (truncate instead of round) and a single line_size is used
 * for both the sources and the destination.
 *
 *   R0      = block      destination, written through I3
 *   R1      = s0         first source, read through I0
 *   R2      = s1         second source, read through I1
 *   [sp+12] = line_size  byte distance between rows
 *   [sp+16] = h          number of rows
 *
 * The stack arguments are fetched before R7:6 are pushed.
 */
DEFUN(put_pixels8uc_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h
        [--sp] = (r7:6);
        r2+=-4;
        m3=r2;        // line_size - 4: destination row step
        r2+=-4;
        m0=r2;        // line_size - 8: source row step
        LSETUP(pp8$2,pp8$3) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];

pp8$2:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)(T)  || R0 = [I0++M0]|| R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]  || [I3++] = R6 ;
pp8$3:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
DEFUN_END(put_pixels8uc_nornd)
192

    
193
/*
 * put_pixels16uc_nornd
 *
 * 16-byte-wide truncating average of s0 and s1 (BYTEOP1P with the (T)
 * option), written to block for h rows. A single line_size is used for
 * both the sources and the destination.
 *
 *   R0      = block      destination, written through I3
 *   R1      = s0         first source, read through I0
 *   R2      = s1         second source, read through I1
 *   [sp+12] = line_size  byte distance between rows
 *   [sp+16] = h          number of rows
 *
 * The stack arguments are fetched before R7:6 are pushed.
 */
DEFUN(put_pixels16uc_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h

        [--sp] = (r7:6);
        r2+=-12;
        m3=r2;        // line_size - 12: 3*4 + m3 = one destination row
        r2+=-4;
        m0=r2;        // line_size - 16: 4*4 + m0 = one source row

        LSETUP(pp16$2,pp16$3) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp16$2:
        DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++]   || R2  =[I1++];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++]   || R3  =[I1++];
        [I3++] = R6;

        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R7 ;
        [I3++] = R6;
pp16$3: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];

        RTS;
DEFUN_END(put_pixels16uc_nornd)
226

    
227
/*
 * z_put_pixels16_xy2
 *
 * 16-byte-wide filter over two adjacent source rows (s0 and
 * s0 + line_size) using BYTEOP2P with rounding, done in two passes over
 * the same h rows:
 *   pass 1 (RNDL): computed from the original source position and
 *                  stored to block;
 *   pass 2 (RNDH): computed with both source pointers advanced by one
 *                  byte, then added (16-bit vector add) to what pass 1
 *                  stored and written back.
 * NOTE(review): the L/H option presumably selects which byte of each
 * 16-bit half receives the result, so the two passes fill alternating
 * output bytes - confirm against the BYTEOP2P reference.
 *
 *   R0      = block      destination, accessed through I3
 *   R1      = s0         source, read through I0 (row) and I1 (next row)
 *   R2      = dest_size  byte distance between destination rows
 *   [fp+20] = line_size  byte distance between source rows
 *   [fp+24] = h          number of rows
 *
 * B0, B1 and B3 are used only as temporaries holding the three start
 * pointers between the passes. R7:4 are saved and restored.
 */
DEFUN(z_put_pixels16_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        r2+=-12;
        m2=r2;        // m2 = dest_size - 12: 3*4 + m2 = one destination row
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-16;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16: 4*4 + m0 = one source row */

        // remember the start pointers for the second pass
        B0 = I0;
        B1 = I1;
        B3 = I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        // pass 1
        LSETUP(LS$16E,LE$16E) LC0=P0;
LS$16E: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
LE$16E: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        // rewind all three pointers, then shift both sources by one byte
        M1 = 1;
        I3 = B3;
        I1 = B1;
        I0 = B0;

        I0 += M1;
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        // pass 2: R6/R7 are the two words pass 1 stored; I3 is stepped
        // forward and back so that the sums overwrite them in place
        LSETUP(LS$16O,LE$16O) LC0=P0;
LS$16O: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6  =[I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
LE$16O: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(z_put_pixels16_xy2)
285

    
286
/*
 * put_pixels16_xy2_nornd
 *
 * Truncating counterpart of z_put_pixels16_xy2: the same two-pass
 * scheme over two adjacent source rows, but BYTEOP2P is used with the
 * TL (pass 1) and TH (pass 2) options, and a single line_size is used
 * for both source and destination.
 * Pass 2 runs with both source pointers advanced by one byte and adds
 * its results (16-bit vector add) to what pass 1 stored.
 *
 *   R0      = block      destination, accessed through I3
 *   R1      = s0         source, read through I0 (row) and I1 (next row)
 *   R2      = line_size  byte distance between rows
 *   [fp+20] = h          number of rows
 *
 * B0, B1 and B3 are used only as temporaries holding the three start
 * pointers between the passes. R7:4 are saved and restored.
 */
DEFUN(put_pixels16_xy2_nornd,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        m3=r2;        // line_size
        r2+=-12;
        m2=r2;        // line_size - 12: 3*4 + m2 = one destination row
        r2+=-4;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16: 4*4 + m0 = one source row */
        p0=[fp+20];   // h

        // remember the start pointers for the second pass
        B0=I0;
        B1=I1;
        B3=I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        // pass 1
        LSETUP(LS$16ET,LE$16ET) LC0=P0;
LS$16ET:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++] || [I3++] = R4 ;
LE$16ET:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        // rewind all three pointers, then shift both sources by one byte
        M1 = 1;
        I3=B3;
        I1=B1;
        I0=B0;

        I0 += M1;
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        // pass 2: merge with the words stored by pass 1
        LSETUP(LS$16OT,LE$16OT) LC0=P0;
LS$16OT:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R1 = [I0++] || R6  =[I3++];
        R4 = R4 +|+ R6                                    || R7 = [I3--];
        R5 = R5 +|+ R7                                    || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                                    || R7 = [I3--];
        R5 = R5 +|+ R7                                    || [I3++] = R4;
LE$16OT:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(put_pixels16_xy2_nornd)
343

    
344
/*
 * z_put_pixels8_xy2
 *
 * 8-byte-wide version of z_put_pixels16_xy2: BYTEOP2P with rounding over
 * two adjacent source rows, in two passes over the same h rows.
 *   pass 1 (RNDL): results stored to block;
 *   pass 2 (RNDH): both source pointers advanced by one byte, results
 *                  added (16-bit vector add) to what pass 1 stored.
 *
 *   R0      = block      destination, accessed through I3
 *   R1      = s0         source, read through I0 (row) and I1 (next row)
 *   R2      = dest_size  byte distance between destination rows
 *   [fp+20] = line_size  byte distance between source rows
 *   [fp+24] = h          number of rows
 *
 * B0, B1 and B3 are used only as temporaries holding the three start
 * pointers between the passes. R7:4 are saved and restored.
 */
DEFUN(z_put_pixels8_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        r2+=-4;
        m2=r2;        // m2 = dest_size - 4: 4 + m2 = one destination row
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-8;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8: 4 + m0 + 4 = one source row */

        // remember the start pointers for the second pass
        b0 = I0;
        b1 = I1;
        b3 = I3;

        // pass 1
        LSETUP(LS$8E,LE$8E) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
LS$8E:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++]   || [I3++] = R4 ;
LE$8E:  DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        // rewind all three pointers, then shift both sources by one byte
        M1 = 1;
        I3 = b3;
        I1 = b1;
        I0 = b0;

        I0 += M1;
        I1 += M1;

        // pass 2: merge with the words stored by pass 1
        LSETUP(LS$8O,LE$8O) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
LS$8O:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++]   || R6  =[I3++];
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
LE$8O:  DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(z_put_pixels8_xy2)
393

    
394
/*
 * put_pixels8_xy2_nornd
 *
 * 8-byte-wide truncating version of the two-pass BYTEOP2P filter over
 * two adjacent source rows (TL in pass 1, TH in pass 2). Pass 2 runs
 * with both source pointers advanced by one byte and adds its results
 * (16-bit vector add) to what pass 1 stored.
 *
 *   R0      = block      destination, accessed through I3
 *   R1      = s0         source, read through I0 (row) and I1 (next row)
 *   R2      = line_size  byte distance between rows
 *   [fp+20] = h          number of rows
 *
 * B0, B1 and B3 are used only as temporaries holding the three start
 * pointers between the passes. R7:4 are saved and restored.
 */
DEFUN(put_pixels8_xy2_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        m3=r2;        // line_size
        r2+=-4;
        m2=r2;        // line_size - 4: 4 + m2 = one destination row
        r2+=-4;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8: 4 + m0 + 4 = one source row */
        p0=[fp+20];   // h

        // remember the start pointers for the second pass
        b0 = I0;
        b1 = I1;
        b3 = I3;

        // pass 1
        LSETUP(LS$8ET,LE$8ET) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

LS$8ET: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0] || R2 = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++]   || [I3++] = R4 ;
LE$8ET: DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        // rewind all three pointers, then shift both sources by one byte
        M1 = 1;
        I3 = b3;
        I1 = b1;
        I0 = b0;

        I0 += M1;
        I1 += M1;

        // pass 2: merge with the words stored by pass 1
        LSETUP(LS$8OT,LE$8OT) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

LS$8OT: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0] || R2 = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++]   || R6 = [I3++];
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
LE$8OT: DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(put_pixels8_xy2_nornd)

/*
 * diff_pixels
 *
 * Byte-wise difference of two 8-byte-wide sources (BYTEOP16M), stored
 * as 16-bit values: 8 rows, four words (eight 16-bit results) written
 * to block per row. block is written contiguously.
 *
 *   R0      = block   destination, written through I3
 *   R1      = s1      first source, read through I0
 *   R2      = s2      second source, read through I1
 *   [fp+20] = stride  byte distance between source rows
 *
 * R7:4 are saved and restored.
 */
DEFUN(diff_pixels,mL1,
       (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;         // number of rows
        i3=r0;        // block
        i0=r1;        // s1
        i1=r2;        // s2
        r2=[fp+20];   // stride
        r2+=-8;
        m0=r2;        // stride - 8: 4 + m0 + 4 = one source row

        LSETUP(.LS0,.LE0) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

.LS0:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || [I3++] = R4;
        DISALGNEXCPT                       || R2 = [I1++]   || [I3++] = R5;
        [i3++]=r6;
.LE0:  [i3++]=r7;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(diff_pixels)
471

    
472
/*
473
    for (i = 0; i < 16; i++) {
474
        for (j = 0; j < 16; j++) {
475
          sum += pix[j];
476
        }
477
        pix += line_size;
478
    }
479
*/
480
/*
 * pix_sum
 *
 * Sums the bytes of a 16-byte-wide, 16-row area. Two rows are handled
 * per loop iteration (I0 walks the even rows, I1 the odd rows), so the
 * loop runs 8 times. BYTEOP16P adds corresponding bytes of the two rows
 * into 16-bit lanes, which are accumulated in the two halves of R6.
 *
 *   R0 = p       start of the area
 *   R1 = stride  byte distance between rows
 *
 * Returns in R0 the low 16 bits of (R6.l + R6.h), zero-extended.
 * R7:4 are saved and restored.
 */
DEFUN(pix_sum,mL1,
        (uint8_t *p, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;          // 8 iterations x 2 rows
        i0=r0;         // even rows
        i1=r0;         // odd rows (stride added below)
        m1=r1;         // stride
        r1=r1+r1;
        r1+=-16;       // 2*stride - 16
        m0=r1;         // 4*4 + m0 = two rows
        i1+=m1;        // i1 = p + stride

        r6=0;          // running sums, one per 16-bit half

        LSETUP(LS$PS,LE$PS) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
LE$PS:  r6=r6+|+r4;
        r0.l=r6.l+r6.h;   // fold the two partial sums
        r0.h=0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(pix_sum)
518

    
519

    
520
/*
 * get_pixels
 *
 * Expands an 8-byte-wide, 8-row area of bytes into 16-bit values
 * (BYTEUNPACK) and writes them contiguously to block: four words per
 * row.
 *
 *   R0 = block      destination, written through I3
 *   R1 = pixels     source, read through I0 (may be unaligned; the
 *                   loads are issued with DISALGNEXCPT or BYTEUNPACK)
 *   R2 = line_size  byte distance between source rows
 *
 * R7:4 are saved and restored.
 * The loads run one row ahead, so the last iteration reads the first
 * 8 bytes of the row following the last one.
 */
DEFUN(get_pixels,mL1,
        (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0
        p0=8;         // number of rows
        r2+=-8;
        m0=r2;        // line_size - 8: steps I0 from the end of the
                      // 8 bytes just read to the start of the next row
        LSETUP(gp8$0,gp8$1) LC0=P0;

        DISALGNEXCPT                   || R0 = [I0++];
        DISALGNEXCPT                   || R1 = [I0++];

        // the R0 load on the first loop line only moves I0 to the next
        // row; its value is replaced by the load on the following line
gp8$0:  (R7,R6) = byteunpack R1:0      || R0 = [I0++M0];
        (R5,R4) = byteunpack R1:0 (R)  || R0 = [I0++]    || [I3++]=R6;
        DISALGNEXCPT                   || R1 = [I0++]    || [I3++]=R7;
        [I3++]=R4;
gp8$1:  [I3++]=R5;

        (r7:4) = [sp++];
        RTS;
DEFUN_END(get_pixels)
543

    
544

    
545
/* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
546
/* 91 cycles */
547
/*
 * z_sad16x16
 *
 * Sum of absolute differences between two 16-byte-wide areas over h
 * rows, accumulated with SAA into the 16-bit halves of A0 and A1.
 *
 *   R0      = blk1       first area, read through I0
 *   R1      = blk2       second area, read through I1
 *   R2      = dsz        byte distance between rows of blk1
 *   [sp+20] = line_size  byte distance between rows of blk2
 *   [sp+24] = h          number of rows
 *
 * The stack arguments are read through SP right after "link 0" and
 * before anything else is pushed.
 * Returns in R0 the sum of the four 16-bit accumulator halves.
 * Only R0-R3, P2, I0/I1, M0/M1 and A0/A1 are modified.
 */
DEFUN(z_sad16x16,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        link 0;
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        R0 = [sp+20]; // rwidth
        P2 = [sp+24]; // height
        R3 = 16;
        R0 = R0 - R3; // line_size - 16
        R3 = R2 - R3; // dsz - 16
        M1 = R0;      // row step adjustment for I1 (blk2)
        M0 = R3;      // row step adjustment for I0 (blk1)

        DISALGNEXCPT         || R0 = [I0++]    || R2 = [I1++];
        LSETUP (s$16, e$16) LC0=P2;
s$16:   DISALGNEXCPT         || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++]    || R2 = [I1++];
        SAA (R1:0,R3:2) (R)  || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++M0]  || R2 = [I1++M1];
e$16:   SAA (R1:0,R3:2) (R)  || R0 = [I0++]    || R2 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;
        R0 = R2 + R3 ;
        unlink;
        RTS;
DEFUN_END(z_sad16x16)
575

    
576
/* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
577
/* 36 cycles */
578
/*
 * z_sad8x8
 *
 * Sum of absolute differences between two 8-byte-wide areas over h
 * rows, accumulated with SAA into the 16-bit halves of A0 and A1.
 *
 *   R0      = blk1       first area, read through I0
 *   R1      = blk2       second area, read through I1
 *   R2      = dsz        byte distance between rows of blk1
 *   [sp+12] = line_size  byte distance between rows of blk2
 *   [sp+16] = h          number of rows
 *
 * No frame is set up and nothing is pushed, so the stack offsets are
 * relative to the entry SP.
 * Returns in R0 the sum of the four 16-bit accumulator halves.
 */
DEFUN(z_sad8x8,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        r0 = [sp+12]; // rwidth
        P2 = [sp+16]; //height
        R3 = 8;
        R0 = R0 - R3; // line_size - 8
        R3 = R2 - R3; // dsz - 8
        M0 = R3;      // row step adjustment for I0 (blk1)
        M1 = R0;      // row step adjustment for I1 (blk2)

        LSETUP (s$8, e$8) LC0=P2;
        // both words of the first row are loaded before the loop
        DISALGNEXCPT         || R0 = [I0++]   || R2 = [I1++];
        DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
        // the loads on the first loop line only move I0/I1 to the next
        // row; their values are replaced by the loads on the next line
s$8:    SAA (R1:0,R3:2)      || R0 = [I0++M0] || R2 = [I1++M1];
        SAA (R1:0,R3:2) (R)  || R0 = [I0++]   || R2 = [I1++];
e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;
        R0 = R2 + R3 ;
        RTS;
DEFUN_END(z_sad8x8)
603

    
604
/*
 * pix_norm1
 *
 * Sum of squares of the bytes of a 16-byte-wide, 16-row area. Each row
 * is unpacked to 16-bit values with BYTEUNPACK (four at a time) and
 * every value is squared and accumulated, split between A0 and A1.
 *
 *   R0 = pix        start of the area
 *   R1 = line_size  byte distance between rows
 *
 * Returns A0 + A1 (saturating add) in R0.
 * R7:4 and P5:3 are saved and restored; hardware loop 1 (LC1) is used.
 */
DEFUN(pix_norm1,mL1,
        (uint8_t * pix, int line_size)):
        [--SP]=(R7:4,P5:3);

        // Fetch the input arguments.
        P1 = R0;  // pix
        P0 = R1;  // line_size
        P5 = 16;  // loop ctr.
        P0 -= P5;
        M0 = P0;  // M0 = line_size-16;
        // Now for the real work.
        A1 = A0 = 0;
        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
        I0 = P1;
        DISALGNEXCPT || r0 = [i0++];

_pix_norm1_blkfn_loopStart:
        // one row per iteration: unpack pix[0..15] in four groups of
        // four bytes and accumulate their squares
        DISALGNEXCPT || r1 = [i0++];

        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
_pix_norm1_blkfn_loopEnd:
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);

// Clean up at the end:
        R2 = A0, R3 = A1;
        R0 = R2 + R3 (S);

        (R7:4,P5:3)=[SP++];

        RTS;
DEFUN_END(pix_norm1)
647

    
648
/*
 * sse4
 *
 * Sum of squared differences between two 4-byte-wide areas over h rows.
 * BYTEOP16M produces the four byte differences of a row as 16-bit
 * values; their squares are accumulated in A0 and A1.
 *
 *   R0      = v          not used
 *   R1      = pix1       first area, read through I0
 *   R2      = pix2       second area, read through I1
 *   [fp+20] = line_size  byte distance between rows (same for both)
 *   [fp+24] = h          number of rows
 *
 * Returns A0 + A1 in R0. R7:6 are saved and restored.
 *
 * Each row needs two words per source (low word in R0/R2, following
 * word in R1/R3) so that unaligned pointers work. The low words for the
 * next row are loaded in parallel with the subtract, the same way sse8
 * does it. Without that reload R0/R2 would keep the first row's data
 * for all h rows and I0/I1 would advance by only line_size-4 per row.
 * The last iteration therefore reads the first word of the row
 * following the last one, like the other functions in this file.
 */
DEFUN(sse4,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-4;
        m0=r2;        // line_size - 4: the R1/R3 load (at row + 4)
                      // leaves I0/I1 at the start of the next row

        a0=a1=0;
        LSETUP(.S40,.E40) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

.S40:   DISALGNEXCPT                       || R1 = [I0++M0] || R3 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E40:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse4)
674

    
675
/*
 * sse8
 *
 * Sum of squared differences between two 8-byte-wide areas over h rows.
 * BYTEOP16M produces four byte differences at a time as 16-bit values;
 * their squares are accumulated in A0 and A1.
 *
 *   R0      = v          not used
 *   R1      = pix1       first area, read through I0
 *   R2      = pix2       second area, read through I1
 *   [fp+20] = line_size  byte distance between rows (same for both)
 *   [fp+24] = h          number of rows
 *
 * Returns A0 + A1 in R0. R7:6 are saved and restored.
 * The loads run one word ahead: the last iteration reads the first
 * word of the row following the last one.
 */
DEFUN(sse8,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-8;
        m0=r2;        // line_size - 8: 4 + m0 + 4 = one row

        a0=a1=0;
        LSETUP(.S80,.E80) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

.S80:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E80:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse8)
704

    
705
/*
 * sse16
 *
 * Sum of squared differences between two 16-byte-wide areas over h
 * rows. BYTEOP16M produces four byte differences at a time as 16-bit
 * values; their squares are accumulated in A0 and A1.
 *
 *   R0      = v          not used
 *   R1      = pix1       first area, read through I0
 *   R2      = pix2       second area, read through I1
 *   [fp+20] = line_size  byte distance between rows (same for both)
 *   [fp+24] = h          number of rows
 *
 * Returns A0 + A1 in R0. R7:6 are saved and restored.
 * The loads run one word ahead: the last iteration reads the first
 * word of the row following the last one.
 */
DEFUN(sse16,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-16;
        m0=r2;        // line_size - 16: 4*4 + m0 = one row

        a0=a1=0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
        LSETUP(.S160,.E160) LC0=P0;

.S160:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++]   || R3 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E160:  a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse16)
740

    
741