
/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
// Transpose 8x8 matrix of 16-bit elements. Borrowed from mpegvideo_altivec.c
#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
do { \
    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
 \
    A1 = vec_mergeh (a, e); \
    B1 = vec_mergel (a, e); \
    C1 = vec_mergeh (b, f); \
    D1 = vec_mergel (b, f); \
    E1 = vec_mergeh (c, g); \
    F1 = vec_mergel (c, g); \
    G1 = vec_mergeh (d, h); \
    H1 = vec_mergel (d, h); \
 \
    A2 = vec_mergeh (A1, E1); \
    B2 = vec_mergel (A1, E1); \
    C2 = vec_mergeh (B1, F1); \
    D2 = vec_mergel (B1, F1); \
    E2 = vec_mergeh (C1, G1); \
    F2 = vec_mergel (C1, G1); \
    G2 = vec_mergeh (D1, H1); \
    H2 = vec_mergel (D1, H1); \
 \
    a = vec_mergeh (A2, E2); \
    b = vec_mergel (A2, E2); \
    c = vec_mergeh (B2, F2); \
    d = vec_mergel (B2, F2); \
    e = vec_mergeh (C2, G2); \
    f = vec_mergel (C2, G2); \
    g = vec_mergeh (D2, H2); \
    h = vec_mergel (D2, H2); \
} while (0)
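
// In STEP8 below the multiplications by the VC-1 transform coefficients
// (12, 16, 15, 9, 6 and 4) are decomposed into shift-and-add sequences,
// so no multiply instruction is needed: e.g. 12*(s0+s4) is computed as
// t0 = (s0+s4)<<2 followed by t0 = (t0<<1)+t0.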


// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
 \
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
 \
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
 \
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
 \
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
 \
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
} while (0)
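
// SHIFT_HOR8 scales the horizontal (row) pass down by 3 bits; combined
// with the +4 rounding term passed to STEP8 this computes (t + 4) >> 3.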


#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
    s4 = vec_sra(s4, vec_3); \
    s5 = vec_sra(s5, vec_3); \
    s6 = vec_sra(s6, vec_3); \
    s7 = vec_sra(s7, vec_3); \
} while (0)
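
// SHIFT_VERT8 scales the vertical (column) pass down by 7 bits. With the
// +64 rounding term this computes (t + 64) >> 7 for the top four rows and
// (t + 64 + 1) >> 7 for the bottom four, matching the asymmetric rounding
// of the VC-1 column transform.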


#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
} while (0)
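
// The 4-point transform (STEP4) uses the VC-1 coefficients 17, 22 and 10,
// again built from shifts and adds: 17*x = (x<<4)+x,
// 10*x = (x<<1)+((x<<1)<<2), and 22*s1 emerges as (s1<<5) - 10*s1.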


/* main steps of 4x4 transform */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
    t1 = vec_add(vec_sl(s0, vec_4), s0); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s2, vec_4), s2); \
    t0 = vec_add(t1, t2); \
    t1 = vec_sub(t1, t2); \
    t3 = vec_sl(vec_sub(s3, s1), vec_1); \
    t3 = vec_add(t3, vec_sl(t3, vec_2)); \
    t2 = vec_add(t3, vec_sl(s1, vec_5)); \
    t3 = vec_add(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s3, vec_2)); \
    s0 = vec_add(t0, t2); \
    s1 = vec_sub(t1, t3); \
    s2 = vec_add(t1, t3); \
    s3 = vec_sub(t0, t2); \
} while (0)

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);


/** Do inverse transform on 8x8 block
 */
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);
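
    // vec_64 is built as 4 << 4 because vec_splat_s32() only encodes
    // 5-bit immediates; it is the +64 rounding term for the vertical pass,
    // while vec_4s is the +4 rounding term for the horizontal pass.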



    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
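
    // Horizontal pass on all eight rows: each 16-bit vector is widened to
    // two 32-bit vectors, s8-sF from the high halves (vec_unpackh) and
    // s0-s7 from the low halves (vec_unpackl), then transformed, repacked
    // and transposed back.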


    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
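
    // Vertical pass: same widening scheme, but with the +64 rounding term
    // and the >>7 scaling of SHIFT_VERT8; the results are packed back to
    // 16 bits and stored.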


    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    vec_st(src0,  0, block);
    vec_st(src1, 16, block);
    vec_st(src2, 32, block);
    vec_st(src3, 48, block);
    vec_st(src4, 64, block);
    vec_st(src5, 80, block);
    vec_st(src6, 96, block);
    vec_st(src7,112, block);
}



/** Do inverse transform on 8x4 part of block
 */
static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);
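
    // n selects which half of the block receives the 4-point vertical
    // pass: 0 processes and stores rows 0-3, anything else rows 4-7.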



    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
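
    // 8-point horizontal pass over all eight rows, identical to the one
    // in vc1_inv_trans_8x8_altivec().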


    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
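
    // 4-point vertical pass, applied and stored only for the half
    // selected by n.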


    if (!n) { // upper half of block
        s0 = vec_unpackh(src0);
        s1 = vec_unpackh(src1);
        s2 = vec_unpackh(src2);
        s3 = vec_unpackh(src3);
        s8 = vec_unpackl(src0);
        s9 = vec_unpackl(src1);
        sA = vec_unpackl(src2);
        sB = vec_unpackl(src3);
        STEP4(s0, s1, s2, s3, vec_64);
        SHIFT_VERT4(s0, s1, s2, s3);
        STEP4(s8, s9, sA, sB, vec_64);
        SHIFT_VERT4(s8, s9, sA, sB);
        src0 = vec_pack(s0, s8);
        src1 = vec_pack(s1, s9);
        src2 = vec_pack(s2, sA);
        src3 = vec_pack(s3, sB);

        vec_st(src0,  0, block);
        vec_st(src1, 16, block);
        vec_st(src2, 32, block);
        vec_st(src3, 48, block);
    } else { // lower half of block
        s0 = vec_unpackh(src4);
        s1 = vec_unpackh(src5);
        s2 = vec_unpackh(src6);
        s3 = vec_unpackh(src7);
        s8 = vec_unpackl(src4);
        s9 = vec_unpackl(src5);
        sA = vec_unpackl(src6);
        sB = vec_unpackl(src7);
        STEP4(s0, s1, s2, s3, vec_64);
        SHIFT_VERT4(s0, s1, s2, s3);
        STEP4(s8, s9, sA, sB, vec_64);
        SHIFT_VERT4(s8, s9, sA, sB);
        src4 = vec_pack(s0, s8);
        src5 = vec_pack(s1, s9);
        src6 = vec_pack(s2, sA);
        src7 = vec_pack(s3, sB);

        vec_st(src4, 64, block);
        vec_st(src5, 80, block);
        vec_st(src6, 96, block);
        vec_st(src7,112, block);
    }
}



void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
}
