Revision 2da0d70d libswscale/swscale_template.c

View differences:

libswscale/swscale_template.c
 #endif

 #define YSCALEYUV2YV12X(x, offset, dest, width) \
-        asm volatile(\
-            "xor %%"REG_a", %%"REG_a"    \n\t"\
-            "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
-            "movq %%mm3, %%mm4        \n\t"\
-            "lea " offset "(%0), %%"REG_d"    \n\t"\
-            "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-            ASMALIGN(4) /* FIXME Unroll? */\
-            "1:                \n\t"\
-            "movq 8(%%"REG_d"), %%mm0    \n\t" /* filterCoeff */\
-            "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
-            "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
-            "add $16, %%"REG_d"        \n\t"\
-            "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-            "test %%"REG_S", %%"REG_S"    \n\t"\
-            "pmulhw %%mm0, %%mm2        \n\t"\
-            "pmulhw %%mm0, %%mm5        \n\t"\
-            "paddw %%mm2, %%mm3        \n\t"\
-            "paddw %%mm5, %%mm4        \n\t"\
-            " jnz 1b            \n\t"\
-            "psraw $3, %%mm3        \n\t"\
-            "psraw $3, %%mm4        \n\t"\
-            "packuswb %%mm4, %%mm3        \n\t"\
-            MOVNTQ(%%mm3, (%1, %%REGa))\
-            "add $8, %%"REG_a"        \n\t"\
-            "cmp %2, %%"REG_a"        \n\t"\
-            "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
-            "movq %%mm3, %%mm4        \n\t"\
-            "lea " offset "(%0), %%"REG_d"    \n\t"\
-            "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-            "jb 1b                \n\t"\
-                        :: "r" (&c->redDither),\
-                        "r" (dest), "g" (width)\
-                        : "%"REG_a, "%"REG_d, "%"REG_S\
-                );
+    asm volatile(\
+    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
+    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
+    "movq                             %%mm3, %%mm4      \n\t"\
+    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
+    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+    ASMALIGN(4) /* FIXME Unroll? */\
+    "1:                                                 \n\t"\
+    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
+    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
+    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
+    "add                                $16, %%"REG_d"  \n\t"\
+    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+    "test                         %%"REG_S", %%"REG_S"  \n\t"\
+    "pmulhw                           %%mm0, %%mm2      \n\t"\
+    "pmulhw                           %%mm0, %%mm5      \n\t"\
+    "paddw                            %%mm2, %%mm3      \n\t"\
+    "paddw                            %%mm5, %%mm4      \n\t"\
+    " jnz                                1b             \n\t"\
+    "psraw                               $3, %%mm3      \n\t"\
+    "psraw                               $3, %%mm4      \n\t"\
+    "packuswb                         %%mm4, %%mm3      \n\t"\
+    MOVNTQ(%%mm3, (%1, %%REGa))\
+    "add                                 $8, %%"REG_a"  \n\t"\
+    "cmp                                 %2, %%"REG_a"  \n\t"\
+    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
+    "movq                             %%mm3, %%mm4      \n\t"\
+    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
+    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+    "jb                                  1b             \n\t"\
+    :: "r" (&c->redDither),\
+    "r" (dest), "g" (width)\
+    : "%"REG_a, "%"REG_d, "%"REG_S\
+    );
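Note: pmulhw is a signed high-half multiply, (a*b)>>16, so each pass of the "1:" loop above adds one filter tap's contribution to the accumulators mm3/mm4. A rough scalar model of one output pixel follows; vRounder, filterSize, src and coeff are illustrative placeholders, not identifiers from this file:

    int acc = vRounder;                            /* movq "VROUNDER_OFFSET"(%0), %%mm3 */
    for (int j = 0; j < filterSize; j++)           /* the "1:" tap loop                 */
        acc += (src[j][i] * coeff[j]) >> 16;       /* pmulhw + paddw (16-bit in MMX)    */
    acc >>= 3;                                     /* psraw $3                          */
    dest[i] = acc < 0 ? 0 : acc > 255 ? 255 : acc; /* packuswb saturation               */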

 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
-        asm volatile(\
-            "lea " offset "(%0), %%"REG_d"    \n\t"\
-            "xor %%"REG_a", %%"REG_a"    \n\t"\
-                        "pxor %%mm4, %%mm4              \n\t"\
-                        "pxor %%mm5, %%mm5              \n\t"\
-                        "pxor %%mm6, %%mm6              \n\t"\
-                        "pxor %%mm7, %%mm7              \n\t"\
-            "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-            ASMALIGN(4) \
-            "1:                \n\t"\
-            "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
-            "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
-            "mov 4(%%"REG_d"), %%"REG_S"    \n\t"\
-            "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
-                        "movq %%mm0, %%mm3              \n\t"\
-                        "punpcklwd %%mm1, %%mm0        \n\t"\
-                        "punpckhwd %%mm1, %%mm3        \n\t"\
-            "movq 8(%%"REG_d"), %%mm1    \n\t" /* filterCoeff */\
-                        "pmaddwd %%mm1, %%mm0           \n\t"\
-                        "pmaddwd %%mm1, %%mm3           \n\t"\
-                        "paddd %%mm0, %%mm4             \n\t"\
-                        "paddd %%mm3, %%mm5             \n\t"\
-            "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
-            "mov 16(%%"REG_d"), %%"REG_S"    \n\t"\
-            "add $16, %%"REG_d"        \n\t"\
-                        "test %%"REG_S", %%"REG_S"      \n\t"\
-                        "movq %%mm2, %%mm0              \n\t"\
-                        "punpcklwd %%mm3, %%mm2        \n\t"\
-                        "punpckhwd %%mm3, %%mm0        \n\t"\
-                        "pmaddwd %%mm1, %%mm2           \n\t"\
-                        "pmaddwd %%mm1, %%mm0           \n\t"\
-                        "paddd %%mm2, %%mm6             \n\t"\
-                        "paddd %%mm0, %%mm7             \n\t"\
-            " jnz 1b            \n\t"\
-            "psrad $16, %%mm4        \n\t"\
-            "psrad $16, %%mm5        \n\t"\
-            "psrad $16, %%mm6        \n\t"\
-            "psrad $16, %%mm7        \n\t"\
-            "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
-            "packssdw %%mm5, %%mm4        \n\t"\
-            "packssdw %%mm7, %%mm6        \n\t"\
-                        "paddw %%mm0, %%mm4             \n\t"\
-                        "paddw %%mm0, %%mm6             \n\t"\
-            "psraw $3, %%mm4        \n\t"\
-            "psraw $3, %%mm6        \n\t"\
-            "packuswb %%mm6, %%mm4        \n\t"\
-            MOVNTQ(%%mm4, (%1, %%REGa))\
-            "add $8, %%"REG_a"        \n\t"\
-            "cmp %2, %%"REG_a"        \n\t"\
-            "lea " offset "(%0), %%"REG_d"    \n\t"\
-                        "pxor %%mm4, %%mm4              \n\t"\
-                        "pxor %%mm5, %%mm5              \n\t"\
-                        "pxor %%mm6, %%mm6              \n\t"\
-                        "pxor %%mm7, %%mm7              \n\t"\
-            "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-            "jb 1b                \n\t"\
-                        :: "r" (&c->redDither),\
-                        "r" (dest), "g" (width)\
-                        : "%"REG_a, "%"REG_d, "%"REG_S\
-                );
+    asm volatile(\
+    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
+    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
+    "pxor                             %%mm4, %%mm4      \n\t"\
+    "pxor                             %%mm5, %%mm5      \n\t"\
+    "pxor                             %%mm6, %%mm6      \n\t"\
+    "pxor                             %%mm7, %%mm7      \n\t"\
+    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+    ASMALIGN(4) \
+    "1:                                                 \n\t"\
+    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
+    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
+    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
+    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
+    "movq                             %%mm0, %%mm3      \n\t"\
+    "punpcklwd                        %%mm1, %%mm0      \n\t"\
+    "punpckhwd                        %%mm1, %%mm3      \n\t"\
+    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
+    "pmaddwd                          %%mm1, %%mm0      \n\t"\
+    "pmaddwd                          %%mm1, %%mm3      \n\t"\
+    "paddd                            %%mm0, %%mm4      \n\t"\
+    "paddd                            %%mm3, %%mm5      \n\t"\
+    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
+    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
+    "add                                $16, %%"REG_d"  \n\t"\
+    "test                         %%"REG_S", %%"REG_S"  \n\t"\
+    "movq                             %%mm2, %%mm0      \n\t"\
+    "punpcklwd                        %%mm3, %%mm2      \n\t"\
+    "punpckhwd                        %%mm3, %%mm0      \n\t"\
+    "pmaddwd                          %%mm1, %%mm2      \n\t"\
+    "pmaddwd                          %%mm1, %%mm0      \n\t"\
+    "paddd                            %%mm2, %%mm6      \n\t"\
+    "paddd                            %%mm0, %%mm7      \n\t"\
+    " jnz                                1b             \n\t"\
+    "psrad                              $16, %%mm4      \n\t"\
+    "psrad                              $16, %%mm5      \n\t"\
+    "psrad                              $16, %%mm6      \n\t"\
+    "psrad                              $16, %%mm7      \n\t"\
+    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
+    "packssdw                         %%mm5, %%mm4      \n\t"\
+    "packssdw                         %%mm7, %%mm6      \n\t"\
+    "paddw                            %%mm0, %%mm4      \n\t"\
+    "paddw                            %%mm0, %%mm6      \n\t"\
+    "psraw                               $3, %%mm4      \n\t"\
+    "psraw                               $3, %%mm6      \n\t"\
+    "packuswb                         %%mm6, %%mm4      \n\t"\
+    MOVNTQ(%%mm4, (%1, %%REGa))\
+    "add                                 $8, %%"REG_a"  \n\t"\
+    "cmp                                 %2, %%"REG_a"  \n\t"\
+    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
+    "pxor                             %%mm4, %%mm4      \n\t"\
+    "pxor                             %%mm5, %%mm5      \n\t"\
+    "pxor                             %%mm6, %%mm6      \n\t"\
+    "pxor                             %%mm7, %%mm7      \n\t"\
+    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+    "jb                                  1b             \n\t"\
+    :: "r" (&c->redDither),\
+    "r" (dest), "g" (width)\
+    : "%"REG_a, "%"REG_d, "%"REG_S\
+    );
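Note: the _ACCURATE variant trades speed for precision. punpcklwd/punpckhwd interleave samples from two consecutive filter taps so that pmaddwd can form src0*coeff0 + src1*coeff1 as full 32-bit sums, and accumulation stays in 32 bits until the final psrad/packssdw. A rough scalar model, same placeholder names as above:

    int64_t acc = 0;
    for (int j = 0; j < filterSize; j++)
        acc += src[j][i] * coeff[j];                 /* pmaddwd + paddd: 32-bit products */
    int v = ((int)(acc >> 16) + vRounder) >> 3;      /* psrad $16, packssdw, paddw, psraw $3 */
    dest[i] = v < 0 ? 0 : v > 255 ? 255 : v;         /* packuswb */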

 #define YSCALEYUV2YV121 \
-            "mov %2, %%"REG_a"        \n\t"\
-            ASMALIGN(4) /* FIXME Unroll? */\
-            "1:                \n\t"\
-            "movq (%0, %%"REG_a", 2), %%mm0    \n\t"\
-            "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
-            "psraw $7, %%mm0        \n\t"\
-            "psraw $7, %%mm1        \n\t"\
-            "packuswb %%mm1, %%mm0        \n\t"\
-            MOVNTQ(%%mm0, (%1, %%REGa))\
-            "add $8, %%"REG_a"        \n\t"\
-            "jnc 1b                \n\t"
+    "mov %2, %%"REG_a"                    \n\t"\
+    ASMALIGN(4) /* FIXME Unroll? */\
+    "1:                                   \n\t"\
+    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
+    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
+    "psraw                 $7, %%mm0      \n\t"\
+    "psraw                 $7, %%mm1      \n\t"\
+    "packuswb           %%mm1, %%mm0      \n\t"\
+    MOVNTQ(%%mm0, (%1, %%REGa))\
+    "add                   $8, %%"REG_a"  \n\t"\
+    "jnc                   1b             \n\t"

 /*
-            :: "m" (-lumFilterSize), "m" (-chrFilterSize),
-               "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
-               "r" (dest), "m" (dstW),
-               "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
-            : "%eax", "%ebx", "%ecx", "%edx", "%esi"
+    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
+       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
+       "r" (dest), "m" (dstW),
+       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
+    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 */
 #define YSCALEYUV2PACKEDX \
-    asm volatile(\
-        "xor %%"REG_a", %%"REG_a"    \n\t"\
-        ASMALIGN(4)\
-        "nop                \n\t"\
-        "1:                \n\t"\
-        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
-        "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
-        "movq %%mm3, %%mm4        \n\t"\
-        ASMALIGN(4)\
-        "2:                \n\t"\
-        "movq 8(%%"REG_d"), %%mm0    \n\t" /* filterCoeff */\
-        "movq (%%"REG_S", %%"REG_a"), %%mm2    \n\t" /* UsrcData */\
-        "movq 4096(%%"REG_S", %%"REG_a"), %%mm5    \n\t" /* VsrcData */\
-        "add $16, %%"REG_d"        \n\t"\
-        "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-        "pmulhw %%mm0, %%mm2        \n\t"\
-        "pmulhw %%mm0, %%mm5        \n\t"\
-        "paddw %%mm2, %%mm3        \n\t"\
-        "paddw %%mm5, %%mm4        \n\t"\
-        "test %%"REG_S", %%"REG_S"    \n\t"\
-        " jnz 2b            \n\t"\
+    asm volatile(\
+    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
+    ASMALIGN(4)\
+    "nop                                            \n\t"\
+    "1:                                             \n\t"\
+    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
+    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
+    "movq                      %%mm3, %%mm4         \n\t"\
+    ASMALIGN(4)\
+    "2:                                             \n\t"\
+    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
+    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
+    "movq 4096(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
+    "add                         $16, %%"REG_d"     \n\t"\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
+    "pmulhw                    %%mm0, %%mm2         \n\t"\
+    "pmulhw                    %%mm0, %%mm5         \n\t"\
+    "paddw                     %%mm2, %%mm3         \n\t"\
+    "paddw                     %%mm5, %%mm4         \n\t"\
+    "test                  %%"REG_S", %%"REG_S"     \n\t"\
+    " jnz                         2b                \n\t"\
 \
-        "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
-        "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-        "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
-        "movq %%mm1, %%mm7        \n\t"\
-        ASMALIGN(4)\
-        "2:                \n\t"\
-        "movq 8(%%"REG_d"), %%mm0    \n\t" /* filterCoeff */\
-        "movq (%%"REG_S", %%"REG_a", 2), %%mm2    \n\t" /* Y1srcData */\
-        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5    \n\t" /* Y2srcData */\
-        "add $16, %%"REG_d"        \n\t"\
-        "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-        "pmulhw %%mm0, %%mm2        \n\t"\
-        "pmulhw %%mm0, %%mm5        \n\t"\
-        "paddw %%mm2, %%mm1        \n\t"\
-        "paddw %%mm5, %%mm7        \n\t"\
-        "test %%"REG_S", %%"REG_S"    \n\t"\
-        " jnz 2b            \n\t"\
-
-#define YSCALEYUV2PACKEDX_END\
-        :: "r" (&c->redDither), \
-            "m" (dummy), "m" (dummy), "m" (dummy),\
-            "r" (dest), "m" (dstW)\
-        : "%"REG_a, "%"REG_d, "%"REG_S\
-        );
+    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
+    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
+    "movq                      %%mm1, %%mm7         \n\t"\
+    ASMALIGN(4)\
+    "2:                                             \n\t"\
+    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
+    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
+    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
+    "add                         $16, %%"REG_d"     \n\t"\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
+    "pmulhw                    %%mm0, %%mm2         \n\t"\
+    "pmulhw                    %%mm0, %%mm5         \n\t"\
+    "paddw                     %%mm2, %%mm1         \n\t"\
+    "paddw                     %%mm5, %%mm7         \n\t"\
+    "test                  %%"REG_S", %%"REG_S"     \n\t"\
+    " jnz                         2b                \n\t"\
+
+#define YSCALEYUV2PACKEDX_END                 \
+    :: "r" (&c->redDither),                   \
+        "m" (dummy), "m" (dummy), "m" (dummy),\
+        "r" (dest), "m" (dstW)                \
+    : "%"REG_a, "%"REG_d, "%"REG_S            \
+    );
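Note: YSCALEYUV2PACKEDX and YSCALEYUV2PACKEDX_END bracket a caller-supplied conversion body. The first "2:" loop runs the vertical chroma filter, leaving U in mm3 and V in mm4 (the V plane sits 4096 bytes after U in the scratch buffer), the second "2:" loop runs the vertical luma filter, leaving Y1 in mm1 and Y2 in mm7, and the _END macro closes the asm statement with the shared operand list. Callers in this file follow the pattern (sketch):

    YSCALEYUV2PACKEDX
    /* format-specific asm that packs mm1/mm7 (Y) and mm3/mm4 (U/V) and stores them */
    YSCALEYUV2PACKEDX_END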

 #define YSCALEYUV2PACKEDX_ACCURATE \
-    asm volatile(\
-        "xor %%"REG_a", %%"REG_a"    \n\t"\
-        ASMALIGN(4)\
-        "nop                \n\t"\
-        "1:                \n\t"\
-        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
-        "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-                "pxor %%mm4, %%mm4              \n\t"\
-                "pxor %%mm5, %%mm5              \n\t"\
-                "pxor %%mm6, %%mm6              \n\t"\
-                "pxor %%mm7, %%mm7              \n\t"\
-        ASMALIGN(4)\
-        "2:                \n\t"\
-        "movq (%%"REG_S", %%"REG_a"), %%mm0    \n\t" /* UsrcData */\
-        "movq 4096(%%"REG_S", %%"REG_a"), %%mm2    \n\t" /* VsrcData */\
-        "mov 4(%%"REG_d"), %%"REG_S"    \n\t"\
-        "movq (%%"REG_S", %%"REG_a"), %%mm1    \n\t" /* UsrcData */\
-                "movq %%mm0, %%mm3              \n\t"\
-                "punpcklwd %%mm1, %%mm0        \n\t"\
-                "punpckhwd %%mm1, %%mm3        \n\t"\
-                "movq 8(%%"REG_d"), %%mm1    \n\t" /* filterCoeff */\
-                "pmaddwd %%mm1, %%mm0           \n\t"\
-                "pmaddwd %%mm1, %%mm3           \n\t"\
-                "paddd %%mm0, %%mm4             \n\t"\
-                "paddd %%mm3, %%mm5             \n\t"\
-        "movq 4096(%%"REG_S", %%"REG_a"), %%mm3    \n\t" /* VsrcData */\
-                "mov 16(%%"REG_d"), %%"REG_S"    \n\t"\
-        "add $16, %%"REG_d"        \n\t"\
-                "test %%"REG_S", %%"REG_S"      \n\t"\
-                "movq %%mm2, %%mm0              \n\t"\
-                "punpcklwd %%mm3, %%mm2        \n\t"\
-                "punpckhwd %%mm3, %%mm0        \n\t"\
-                "pmaddwd %%mm1, %%mm2           \n\t"\
-                "pmaddwd %%mm1, %%mm0           \n\t"\
-                "paddd %%mm2, %%mm6             \n\t"\
-                "paddd %%mm0, %%mm7             \n\t"\
-        " jnz 2b            \n\t"\
-                "psrad $16, %%mm4        \n\t"\
-                "psrad $16, %%mm5        \n\t"\
-                "psrad $16, %%mm6        \n\t"\
-                "psrad $16, %%mm7        \n\t"\
-                "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
-                "packssdw %%mm5, %%mm4        \n\t"\
-                "packssdw %%mm7, %%mm6        \n\t"\
-                "paddw %%mm0, %%mm4             \n\t"\
-                "paddw %%mm0, %%mm6             \n\t"\
-                "movq %%mm4, "U_TEMP"(%0)       \n\t"\
-                "movq %%mm6, "V_TEMP"(%0)       \n\t"\
+    asm volatile(\
+    "xor %%"REG_a", %%"REG_a"                       \n\t"\
+    ASMALIGN(4)\
+    "nop                                            \n\t"\
+    "1:                                             \n\t"\
+    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
+    "pxor                      %%mm4, %%mm4         \n\t"\
+    "pxor                      %%mm5, %%mm5         \n\t"\
+    "pxor                      %%mm6, %%mm6         \n\t"\
+    "pxor                      %%mm7, %%mm7         \n\t"\
+    ASMALIGN(4)\
+    "2:                                             \n\t"\
+    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
+    "movq 4096(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
+    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
+    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
+    "movq                      %%mm0, %%mm3         \n\t"\
+    "punpcklwd                 %%mm1, %%mm0         \n\t"\
+    "punpckhwd                 %%mm1, %%mm3         \n\t"\
+    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
+    "pmaddwd                   %%mm1, %%mm0         \n\t"\
+    "pmaddwd                   %%mm1, %%mm3         \n\t"\
+    "paddd                     %%mm0, %%mm4         \n\t"\
+    "paddd                     %%mm3, %%mm5         \n\t"\
+    "movq 4096(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
+    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
+    "add                         $16, %%"REG_d"     \n\t"\
+    "test                  %%"REG_S", %%"REG_S"     \n\t"\
+    "movq                      %%mm2, %%mm0         \n\t"\
+    "punpcklwd                 %%mm3, %%mm2         \n\t"\
+    "punpckhwd                 %%mm3, %%mm0         \n\t"\
+    "pmaddwd                   %%mm1, %%mm2         \n\t"\
+    "pmaddwd                   %%mm1, %%mm0         \n\t"\
+    "paddd                     %%mm2, %%mm6         \n\t"\
+    "paddd                     %%mm0, %%mm7         \n\t"\
+    " jnz                         2b                \n\t"\
+    "psrad                       $16, %%mm4         \n\t"\
+    "psrad                       $16, %%mm5         \n\t"\
+    "psrad                       $16, %%mm6         \n\t"\
+    "psrad                       $16, %%mm7         \n\t"\
+    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
+    "packssdw                  %%mm5, %%mm4         \n\t"\
+    "packssdw                  %%mm7, %%mm6         \n\t"\
+    "paddw                     %%mm0, %%mm4         \n\t"\
+    "paddw                     %%mm0, %%mm6         \n\t"\
+    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
+    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
 \
-        "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
-        "mov (%%"REG_d"), %%"REG_S"    \n\t"\
-                "pxor %%mm1, %%mm1              \n\t"\
-                "pxor %%mm5, %%mm5              \n\t"\
-                "pxor %%mm7, %%mm7              \n\t"\
-                "pxor %%mm6, %%mm6              \n\t"\
-        ASMALIGN(4)\
-        "2:                \n\t"\
-        "movq (%%"REG_S", %%"REG_a", 2), %%mm0    \n\t" /* Y1srcData */\
-        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2    \n\t" /* Y2srcData */\
-        "mov 4(%%"REG_d"), %%"REG_S"    \n\t"\
-        "movq (%%"REG_S", %%"REG_a", 2), %%mm4    \n\t" /* Y1srcData */\
-                "movq %%mm0, %%mm3              \n\t"\
-                "punpcklwd %%mm4, %%mm0        \n\t"\
-                "punpckhwd %%mm4, %%mm3        \n\t"\
-                "movq 8(%%"REG_d"), %%mm4    \n\t" /* filterCoeff */\
-                "pmaddwd %%mm4, %%mm0           \n\t"\
-                "pmaddwd %%mm4, %%mm3           \n\t"\
-                "paddd %%mm0, %%mm1             \n\t"\
-                "paddd %%mm3, %%mm5             \n\t"\
-        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3    \n\t" /* Y2srcData */\
-                "mov 16(%%"REG_d"), %%"REG_S"    \n\t"\
-        "add $16, %%"REG_d"        \n\t"\
-                "test %%"REG_S", %%"REG_S"      \n\t"\
-                "movq %%mm2, %%mm0              \n\t"\
-                "punpcklwd %%mm3, %%mm2        \n\t"\
-                "punpckhwd %%mm3, %%mm0        \n\t"\
-                "pmaddwd %%mm4, %%mm2           \n\t"\
-                "pmaddwd %%mm4, %%mm0           \n\t"\
-                "paddd %%mm2, %%mm7             \n\t"\
-                "paddd %%mm0, %%mm6             \n\t"\
-        " jnz 2b            \n\t"\
-                "psrad $16, %%mm1        \n\t"\
-                "psrad $16, %%mm5        \n\t"\
-                "psrad $16, %%mm7        \n\t"\
-                "psrad $16, %%mm6        \n\t"\
-                "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
-                "packssdw %%mm5, %%mm1        \n\t"\
-                "packssdw %%mm6, %%mm7        \n\t"\
-                "paddw %%mm0, %%mm1             \n\t"\
-                "paddw %%mm0, %%mm7             \n\t"\
-                "movq  "U_TEMP"(%0), %%mm3      \n\t"\
-                "movq  "V_TEMP"(%0), %%mm4      \n\t"\
+    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
+    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
+    "pxor                      %%mm1, %%mm1         \n\t"\
+    "pxor                      %%mm5, %%mm5         \n\t"\
+    "pxor                      %%mm7, %%mm7         \n\t"\
+    "pxor                      %%mm6, %%mm6         \n\t"\
+    ASMALIGN(4)\
+    "2:                                             \n\t"\
+    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
+    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
+    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
+    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
+    "movq                      %%mm0, %%mm3         \n\t"\
+    "punpcklwd                 %%mm4, %%mm0         \n\t"\
+    "punpckhwd                 %%mm4, %%mm3         \n\t"\
+    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
+    "pmaddwd                   %%mm4, %%mm0         \n\t"\
+    "pmaddwd                   %%mm4, %%mm3         \n\t"\
+    "paddd                     %%mm0, %%mm1         \n\t"\
+    "paddd                     %%mm3, %%mm5         \n\t"\
+    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
+    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
+    "add                         $16, %%"REG_d"     \n\t"\
+    "test                  %%"REG_S", %%"REG_S"     \n\t"\
+    "movq                      %%mm2, %%mm0         \n\t"\
+    "punpcklwd                 %%mm3, %%mm2         \n\t"\
+    "punpckhwd                 %%mm3, %%mm0         \n\t"\
+    "pmaddwd                   %%mm4, %%mm2         \n\t"\
+    "pmaddwd                   %%mm4, %%mm0         \n\t"\
+    "paddd                     %%mm2, %%mm7         \n\t"\
+    "paddd                     %%mm0, %%mm6         \n\t"\
+    " jnz                         2b                \n\t"\
+    "psrad                       $16, %%mm1         \n\t"\
+    "psrad                       $16, %%mm5         \n\t"\
+    "psrad                       $16, %%mm7         \n\t"\
+    "psrad                       $16, %%mm6         \n\t"\
+    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
+    "packssdw                  %%mm5, %%mm1         \n\t"\
+    "packssdw                  %%mm6, %%mm7         \n\t"\
+    "paddw                     %%mm0, %%mm1         \n\t"\
+    "paddw                     %%mm0, %%mm7         \n\t"\
+    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
+    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

 #define YSCALEYUV2RGBX \
-        "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
-        "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
-        "movq %%mm3, %%mm2        \n\t" /* (U-128)8*/\
-        "movq %%mm4, %%mm5        \n\t" /* (V-128)8*/\
-        "pmulhw "UG_COEFF"(%0), %%mm3    \n\t"\
-        "pmulhw "VG_COEFF"(%0), %%mm4    \n\t"\
-    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
-        "pmulhw "UB_COEFF"(%0), %%mm2    \n\t"\
-        "pmulhw "VR_COEFF"(%0), %%mm5    \n\t"\
-        "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
-        "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
-        "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
-        "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
-    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
-        "paddw %%mm3, %%mm4        \n\t"\
-        "movq %%mm2, %%mm0        \n\t"\
-        "movq %%mm5, %%mm6        \n\t"\
-        "movq %%mm4, %%mm3        \n\t"\
-        "punpcklwd %%mm2, %%mm2        \n\t"\
-        "punpcklwd %%mm5, %%mm5        \n\t"\
-        "punpcklwd %%mm4, %%mm4        \n\t"\
-        "paddw %%mm1, %%mm2        \n\t"\
-        "paddw %%mm1, %%mm5        \n\t"\
-        "paddw %%mm1, %%mm4        \n\t"\
-        "punpckhwd %%mm0, %%mm0        \n\t"\
-        "punpckhwd %%mm6, %%mm6        \n\t"\
-        "punpckhwd %%mm3, %%mm3        \n\t"\
-        "paddw %%mm7, %%mm0        \n\t"\
-        "paddw %%mm7, %%mm6        \n\t"\
-        "paddw %%mm7, %%mm3        \n\t"\
-        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
-        "packuswb %%mm0, %%mm2        \n\t"\
-        "packuswb %%mm6, %%mm5        \n\t"\
-        "packuswb %%mm3, %%mm4        \n\t"\
-        "pxor %%mm7, %%mm7        \n\t"
+    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
+    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
+    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
+    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
+/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
+    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
+    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
+    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
+/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw           %%mm3, %%mm4       \n\t"\
+    "movq            %%mm2, %%mm0       \n\t"\
+    "movq            %%mm5, %%mm6       \n\t"\
+    "movq            %%mm4, %%mm3       \n\t"\
+    "punpcklwd       %%mm2, %%mm2       \n\t"\
+    "punpcklwd       %%mm5, %%mm5       \n\t"\
+    "punpcklwd       %%mm4, %%mm4       \n\t"\
+    "paddw           %%mm1, %%mm2       \n\t"\
+    "paddw           %%mm1, %%mm5       \n\t"\
+    "paddw           %%mm1, %%mm4       \n\t"\
+    "punpckhwd       %%mm0, %%mm0       \n\t"\
+    "punpckhwd       %%mm6, %%mm6       \n\t"\
+    "punpckhwd       %%mm3, %%mm3       \n\t"\
+    "paddw           %%mm7, %%mm0       \n\t"\
+    "paddw           %%mm7, %%mm6       \n\t"\
+    "paddw           %%mm7, %%mm3       \n\t"\
+    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+    "packuswb        %%mm0, %%mm2       \n\t"\
+    "packuswb        %%mm6, %%mm5       \n\t"\
+    "packuswb        %%mm3, %%mm4       \n\t"\
+    "pxor            %%mm7, %%mm7       \n\t"
 #if 0
 #define FULL_YSCALEYUV2RGB \
-        "pxor %%mm7, %%mm7        \n\t"\
-        "movd %6, %%mm6            \n\t" /*yalpha1*/\
-        "punpcklwd %%mm6, %%mm6        \n\t"\
-        "punpcklwd %%mm6, %%mm6        \n\t"\
-        "movd %7, %%mm5            \n\t" /*uvalpha1*/\
-        "punpcklwd %%mm5, %%mm5        \n\t"\
-        "punpcklwd %%mm5, %%mm5        \n\t"\
-        "xor %%"REG_a", %%"REG_a"        \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%0, %%"REG_a", 2), %%mm0    \n\t" /*buf0[eax]*/\
-        "movq (%1, %%"REG_a", 2), %%mm1    \n\t" /*buf1[eax]*/\
-        "movq (%2, %%"REG_a",2), %%mm2    \n\t" /* uvbuf0[eax]*/\
-        "movq (%3, %%"REG_a",2), %%mm3    \n\t" /* uvbuf1[eax]*/\
-        "psubw %%mm1, %%mm0        \n\t" /* buf0[eax] - buf1[eax]*/\
-        "psubw %%mm3, %%mm2        \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
-        "pmulhw %%mm6, %%mm0        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
-        "pmulhw %%mm5, %%mm2        \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
-        "psraw $4, %%mm1        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "movq 4096(%2, %%"REG_a",2), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
-        "psraw $4, %%mm3        \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-        "paddw %%mm0, %%mm1        \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-        "movq 4096(%3, %%"REG_a",2), %%mm0    \n\t" /* uvbuf1[eax+2048]*/\
-        "paddw %%mm2, %%mm3        \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
-        "psubw %%mm0, %%mm4        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
-        "psubw "MANGLE(w80)", %%mm1    \n\t" /* 8(Y-16)*/\
-        "psubw "MANGLE(w400)", %%mm3    \n\t" /* 8(U-128)*/\
-        "pmulhw "MANGLE(yCoeff)", %%mm1    \n\t"\
+    "pxor                 %%mm7, %%mm7  \n\t"\
+    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
+    "punpcklwd            %%mm6, %%mm6  \n\t"\
+    "punpcklwd            %%mm6, %%mm6  \n\t"\
+    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
+    "punpcklwd            %%mm5, %%mm5  \n\t"\
+    "punpcklwd            %%mm5, %%mm5  \n\t"\
+    "xor              %%"REG_a", %%"REG_a"  \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
+    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
+    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
+    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
+    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
+    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
+    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
+    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "movq 4096(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
+    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+    "movq 4096(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
+    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
+    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
+    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
+    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
+    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
 \
 \
-        "pmulhw %%mm5, %%mm4        \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
-        "movq %%mm3, %%mm2        \n\t" /* (U-128)8*/\
-        "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
-        "psraw $4, %%mm0        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
-        "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
-        "paddw %%mm4, %%mm0        \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
-        "psubw "MANGLE(w400)", %%mm0    \n\t" /* (V-128)8*/\
+    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
+    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
+    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
+    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
+    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
+    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
 \
 \
-        "movq %%mm0, %%mm4        \n\t" /* (V-128)8*/\
-        "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
-        "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
-        "paddw %%mm1, %%mm3        \n\t" /* B*/\
-        "paddw %%mm1, %%mm0        \n\t" /* R*/\
-        "packuswb %%mm3, %%mm3        \n\t"\
+    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
+    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
+    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
+    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
+    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
+    "packuswb             %%mm3, %%mm3  \n\t"\
 \
-        "packuswb %%mm0, %%mm0        \n\t"\
-        "paddw %%mm4, %%mm2        \n\t"\
-        "paddw %%mm2, %%mm1        \n\t" /* G*/\
+    "packuswb             %%mm0, %%mm0  \n\t"\
+    "paddw                %%mm4, %%mm2  \n\t"\
+    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
 \
-        "packuswb %%mm1, %%mm1        \n\t"
+    "packuswb             %%mm1, %%mm1  \n\t"
 #endif

 #define REAL_YSCALEYUV2PACKED(index, c) \
-        "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
-        "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
-        "psraw $3, %%mm0        \n\t"\
-        "psraw $3, %%mm1        \n\t"\
-        "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
-        "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
-        "xor "#index", "#index"        \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%2, "#index"), %%mm2    \n\t" /* uvbuf0[eax]*/\
-        "movq (%3, "#index"), %%mm3    \n\t" /* uvbuf1[eax]*/\
-        "movq 4096(%2, "#index"), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
-        "movq 4096(%3, "#index"), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
-        "psubw %%mm3, %%mm2        \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
-        "psubw %%mm4, %%mm5        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
-        "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
-        "pmulhw %%mm0, %%mm2        \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
-        "pmulhw %%mm0, %%mm5        \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
-        "psraw $7, %%mm3        \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-        "psraw $7, %%mm4        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
-        "paddw %%mm2, %%mm3        \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
-        "paddw %%mm5, %%mm4        \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
-        "movq (%0, "#index", 2), %%mm0    \n\t" /*buf0[eax]*/\
-        "movq (%1, "#index", 2), %%mm1    \n\t" /*buf1[eax]*/\
-        "movq 8(%0, "#index", 2), %%mm6    \n\t" /*buf0[eax]*/\
-        "movq 8(%1, "#index", 2), %%mm7    \n\t" /*buf1[eax]*/\
-        "psubw %%mm1, %%mm0        \n\t" /* buf0[eax] - buf1[eax]*/\
-        "psubw %%mm7, %%mm6        \n\t" /* buf0[eax] - buf1[eax]*/\
-        "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
-        "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
-        "psraw $7, %%mm1        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "psraw $7, %%mm7        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "paddw %%mm0, %%mm1        \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-        "paddw %%mm6, %%mm7        \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
+    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
+    "psraw                $3, %%mm0                           \n\t"\
+    "psraw                $3, %%mm1                           \n\t"\
+    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
+    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
+    "xor            "#index", "#index"                        \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
+    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
+    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
+    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
+    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
+    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
+    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
+    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
+    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
+    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
+    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
+    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
+    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
+    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
+    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
+    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

 #define REAL_YSCALEYUV2RGB(index, c) \
-        "xor "#index", "#index"    \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%2, "#index"), %%mm2    \n\t" /* uvbuf0[eax]*/\
-        "movq (%3, "#index"), %%mm3    \n\t" /* uvbuf1[eax]*/\
-        "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
-        "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
-        "psubw %%mm3, %%mm2        \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
-        "psubw %%mm4, %%mm5        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
-        "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
-        "pmulhw %%mm0, %%mm2        \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
-        "pmulhw %%mm0, %%mm5        \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
-        "psraw $4, %%mm3        \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-        "psraw $4, %%mm4        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
-        "paddw %%mm2, %%mm3        \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
-        "paddw %%mm5, %%mm4        \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
-        "psubw "U_OFFSET"("#c"), %%mm3    \n\t" /* (U-128)8*/\
-        "psubw "V_OFFSET"("#c"), %%mm4    \n\t" /* (V-128)8*/\
-        "movq %%mm3, %%mm2        \n\t" /* (U-128)8*/\
-        "movq %%mm4, %%mm5        \n\t" /* (V-128)8*/\
-        "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
-        "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
-    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
-        "movq (%0, "#index", 2), %%mm0    \n\t" /*buf0[eax]*/\
-        "movq (%1, "#index", 2), %%mm1    \n\t" /*buf1[eax]*/\
-        "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
-        "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
-        "psubw %%mm1, %%mm0        \n\t" /* buf0[eax] - buf1[eax]*/\
-        "psubw %%mm7, %%mm6        \n\t" /* buf0[eax] - buf1[eax]*/\
-        "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
-        "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
-        "psraw $4, %%mm1        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "psraw $4, %%mm7        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "paddw %%mm0, %%mm1        \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-        "paddw %%mm6, %%mm7        \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-        "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
-        "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
-        "psubw "Y_OFFSET"("#c"), %%mm1    \n\t" /* 8(Y-16)*/\
-        "psubw "Y_OFFSET"("#c"), %%mm7    \n\t" /* 8(Y-16)*/\
-        "pmulhw "Y_COEFF"("#c"), %%mm1    \n\t"\
-        "pmulhw "Y_COEFF"("#c"), %%mm7    \n\t"\
-    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
-        "paddw %%mm3, %%mm4        \n\t"\
-        "movq %%mm2, %%mm0        \n\t"\
-        "movq %%mm5, %%mm6        \n\t"\
-        "movq %%mm4, %%mm3        \n\t"\
-        "punpcklwd %%mm2, %%mm2        \n\t"\
-        "punpcklwd %%mm5, %%mm5        \n\t"\
-        "punpcklwd %%mm4, %%mm4        \n\t"\
-        "paddw %%mm1, %%mm2        \n\t"\
-        "paddw %%mm1, %%mm5        \n\t"\
-        "paddw %%mm1, %%mm4        \n\t"\
-        "punpckhwd %%mm0, %%mm0        \n\t"\
-        "punpckhwd %%mm6, %%mm6        \n\t"\
-        "punpckhwd %%mm3, %%mm3        \n\t"\
-        "paddw %%mm7, %%mm0        \n\t"\
-        "paddw %%mm7, %%mm6        \n\t"\
-        "paddw %%mm7, %%mm3        \n\t"\
-        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
-        "packuswb %%mm0, %%mm2        \n\t"\
-        "packuswb %%mm6, %%mm5        \n\t"\
-        "packuswb %%mm3, %%mm4        \n\t"\
-        "pxor %%mm7, %%mm7        \n\t"
+    "xor            "#index", "#index"  \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
+    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
+    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
+    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
+    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
+    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
+    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
+    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
+    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
+    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
+    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
+    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
+    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
+    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
+    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
+    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
+    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
+    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
+    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
+    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
+    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
+    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
+    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw             %%mm3, %%mm4     \n\t"\
+    "movq              %%mm2, %%mm0     \n\t"\
+    "movq              %%mm5, %%mm6     \n\t"\
+    "movq              %%mm4, %%mm3     \n\t"\
+    "punpcklwd         %%mm2, %%mm2     \n\t"\
+    "punpcklwd         %%mm5, %%mm5     \n\t"\
+    "punpcklwd         %%mm4, %%mm4     \n\t"\
+    "paddw             %%mm1, %%mm2     \n\t"\
+    "paddw             %%mm1, %%mm5     \n\t"\
+    "paddw             %%mm1, %%mm4     \n\t"\
+    "punpckhwd         %%mm0, %%mm0     \n\t"\
+    "punpckhwd         %%mm6, %%mm6     \n\t"\
+    "punpckhwd         %%mm3, %%mm3     \n\t"\
+    "paddw             %%mm7, %%mm0     \n\t"\
+    "paddw             %%mm7, %%mm6     \n\t"\
+    "paddw             %%mm7, %%mm3     \n\t"\
+    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+    "packuswb          %%mm0, %%mm2     \n\t"\
+    "packuswb          %%mm6, %%mm5     \n\t"\
+    "packuswb          %%mm3, %%mm4     \n\t"\
+    "pxor              %%mm7, %%mm7     \n\t"
 #define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)

 #define REAL_YSCALEYUV2PACKED1(index, c) \
-        "xor "#index", "#index"        \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%2, "#index"), %%mm3    \n\t" /* uvbuf0[eax]*/\
-        "movq 4096(%2, "#index"), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
-        "psraw $7, %%mm3        \n\t" \
-        "psraw $7, %%mm4        \n\t" \
-        "movq (%0, "#index", 2), %%mm1    \n\t" /*buf0[eax]*/\
-        "movq 8(%0, "#index", 2), %%mm7    \n\t" /*buf0[eax]*/\
-        "psraw $7, %%mm1        \n\t" \
-        "psraw $7, %%mm7        \n\t" \
+    "xor            "#index", "#index"  \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
+    "movq 4096(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
+    "psraw                $7, %%mm3     \n\t" \
+    "psraw                $7, %%mm4     \n\t" \
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
+    "psraw                $7, %%mm1     \n\t" \
+    "psraw                $7, %%mm7     \n\t" \

 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

 #define REAL_YSCALEYUV2RGB1(index, c) \
-        "xor "#index", "#index"    \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%2, "#index"), %%mm3    \n\t" /* uvbuf0[eax]*/\
-        "movq 4096(%2, "#index"), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
-        "psraw $4, %%mm3        \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-        "psraw $4, %%mm4        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
-        "psubw "U_OFFSET"("#c"), %%mm3    \n\t" /* (U-128)8*/\
-        "psubw "V_OFFSET"("#c"), %%mm4    \n\t" /* (V-128)8*/\
-        "movq %%mm3, %%mm2        \n\t" /* (U-128)8*/\
-        "movq %%mm4, %%mm5        \n\t" /* (V-128)8*/\
-        "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
-        "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
-    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
-        "movq (%0, "#index", 2), %%mm1    \n\t" /*buf0[eax]*/\
-        "movq 8(%0, "#index", 2), %%mm7    \n\t" /*buf0[eax]*/\
-        "psraw $4, %%mm1        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "psraw $4, %%mm7        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
-        "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
-        "psubw "Y_OFFSET"("#c"), %%mm1    \n\t" /* 8(Y-16)*/\
-        "psubw "Y_OFFSET"("#c"), %%mm7    \n\t" /* 8(Y-16)*/\
-        "pmulhw "Y_COEFF"("#c"), %%mm1    \n\t"\
-        "pmulhw "Y_COEFF"("#c"), %%mm7    \n\t"\
-    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
-        "paddw %%mm3, %%mm4        \n\t"\
-        "movq %%mm2, %%mm0        \n\t"\
-        "movq %%mm5, %%mm6        \n\t"\
-        "movq %%mm4, %%mm3        \n\t"\
-        "punpcklwd %%mm2, %%mm2        \n\t"\
-        "punpcklwd %%mm5, %%mm5        \n\t"\
-        "punpcklwd %%mm4, %%mm4        \n\t"\
-        "paddw %%mm1, %%mm2        \n\t"\
-        "paddw %%mm1, %%mm5        \n\t"\
-        "paddw %%mm1, %%mm4        \n\t"\
-        "punpckhwd %%mm0, %%mm0        \n\t"\
-        "punpckhwd %%mm6, %%mm6        \n\t"\
-        "punpckhwd %%mm3, %%mm3        \n\t"\
-        "paddw %%mm7, %%mm0        \n\t"\
-        "paddw %%mm7, %%mm6        \n\t"\
-        "paddw %%mm7, %%mm3        \n\t"\
-        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
-        "packuswb %%mm0, %%mm2        \n\t"\
-        "packuswb %%mm6, %%mm5        \n\t"\
-        "packuswb %%mm3, %%mm4        \n\t"\
-        "pxor %%mm7, %%mm7        \n\t"
+    "xor            "#index", "#index"  \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
+    "movq 4096(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
+    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
+    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
+    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
+    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
+    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
+    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
+    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
+    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
+    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
+    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
+    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
+    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
+    "paddw             %%mm3, %%mm4     \n\t"\
+    "movq              %%mm2, %%mm0     \n\t"\
+    "movq              %%mm5, %%mm6     \n\t"\
+    "movq              %%mm4, %%mm3     \n\t"\
+    "punpcklwd         %%mm2, %%mm2     \n\t"\
+    "punpcklwd         %%mm5, %%mm5     \n\t"\
+    "punpcklwd         %%mm4, %%mm4     \n\t"\
+    "paddw             %%mm1, %%mm2     \n\t"\
+    "paddw             %%mm1, %%mm5     \n\t"\
+    "paddw             %%mm1, %%mm4     \n\t"\
+    "punpckhwd         %%mm0, %%mm0     \n\t"\
+    "punpckhwd         %%mm6, %%mm6     \n\t"\
+    "punpckhwd         %%mm3, %%mm3     \n\t"\
+    "paddw             %%mm7, %%mm0     \n\t"\
+    "paddw             %%mm7, %%mm6     \n\t"\
+    "paddw             %%mm7, %%mm3     \n\t"\
+    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
+    "packuswb          %%mm0, %%mm2     \n\t"\
+    "packuswb          %%mm6, %%mm5     \n\t"\
+    "packuswb          %%mm3, %%mm4     \n\t"\
+    "pxor              %%mm7, %%mm7     \n\t"
 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

 #define REAL_YSCALEYUV2PACKED1b(index, c) \
-        "xor "#index", "#index"        \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%2, "#index"), %%mm2    \n\t" /* uvbuf0[eax]*/\
-        "movq (%3, "#index"), %%mm3    \n\t" /* uvbuf1[eax]*/\
-        "movq 4096(%2, "#index"), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
-        "movq 4096(%3, "#index"), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
-        "paddw %%mm2, %%mm3        \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
-        "paddw %%mm5, %%mm4        \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
-        "psrlw $8, %%mm3        \n\t" \
-        "psrlw $8, %%mm4        \n\t" \
-        "movq (%0, "#index", 2), %%mm1    \n\t" /*buf0[eax]*/\
-        "movq 8(%0, "#index", 2), %%mm7    \n\t" /*buf0[eax]*/\
-        "psraw $7, %%mm1        \n\t" \
-        "psraw $7, %%mm7        \n\t"
+    "xor "#index", "#index"             \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
+    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
+    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
+    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+    "psrlw                $8, %%mm3     \n\t" \
+    "psrlw                $8, %%mm4     \n\t" \
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
+    "psraw                $7, %%mm1     \n\t" \
+    "psraw                $7, %%mm7     \n\t"
 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

 // do vertical chrominance interpolation
 #define REAL_YSCALEYUV2RGB1b(index, c) \
-        "xor "#index", "#index"        \n\t"\
-        ASMALIGN(4)\
-        "1:                \n\t"\
-        "movq (%2, "#index"), %%mm2    \n\t" /* uvbuf0[eax]*/\
-        "movq (%3, "#index"), %%mm3    \n\t" /* uvbuf1[eax]*/\
-        "movq 4096(%2, "#index"), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
-        "movq 4096(%3, "#index"), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
-        "paddw %%mm2, %%mm3        \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
-        "paddw %%mm5, %%mm4        \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
-        "psrlw $5, %%mm3        \n\t" /*FIXME might overflow*/\
-        "psrlw $5, %%mm4        \n\t" /*FIXME might overflow*/\
-        "psubw "U_OFFSET"("#c"), %%mm3    \n\t" /* (U-128)8*/\
-        "psubw "V_OFFSET"("#c"), %%mm4    \n\t" /* (V-128)8*/\
-        "movq %%mm3, %%mm2        \n\t" /* (U-128)8*/\
-        "movq %%mm4, %%mm5        \n\t" /* (V-128)8*/\
-        "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
-        "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
-    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
-        "movq (%0, "#index", 2), %%mm1    \n\t" /*buf0[eax]*/\
-        "movq 8(%0, "#index", 2), %%mm7    \n\t" /*buf0[eax]*/\
-        "psraw $4, %%mm1        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "psraw $4, %%mm7        \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-        "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
-        "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
-        "psubw "Y_OFFSET"("#c"), %%mm1    \n\t" /* 8(Y-16)*/\
-        "psubw "Y_OFFSET"("#c"), %%mm7    \n\t" /* 8(Y-16)*/\
-        "pmulhw "Y_COEFF"("#c"), %%mm1    \n\t"\
-        "pmulhw "Y_COEFF"("#c"), %%mm7    \n\t"\
-    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
-        "paddw %%mm3, %%mm4        \n\t"\
-        "movq %%mm2, %%mm0        \n\t"\
-        "movq %%mm5, %%mm6        \n\t"\
-        "movq %%mm4, %%mm3        \n\t"\
-        "punpcklwd %%mm2, %%mm2        \n\t"\
-        "punpcklwd %%mm5, %%mm5        \n\t"\
-        "punpcklwd %%mm4, %%mm4        \n\t"\
-        "paddw %%mm1, %%mm2        \n\t"\
-        "paddw %%mm1, %%mm5        \n\t"\
-        "paddw %%mm1, %%mm4        \n\t"\
-        "punpckhwd %%mm0, %%mm0        \n\t"\
-        "punpckhwd %%mm6, %%mm6        \n\t"\
-        "punpckhwd %%mm3, %%mm3        \n\t"\
-        "paddw %%mm7, %%mm0        \n\t"\
-        "paddw %%mm7, %%mm6        \n\t"\
-        "paddw %%mm7, %%mm3        \n\t"\
-        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
-        "packuswb %%mm0, %%mm2        \n\t"\
-        "packuswb %%mm6, %%mm5        \n\t"\
-        "packuswb %%mm3, %%mm4        \n\t"\
-        "pxor %%mm7, %%mm7        \n\t"
+    "xor            "#index", "#index"  \n\t"\
+    ASMALIGN(4)\
+    "1:                                 \n\t"\
+    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
+    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
+    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
+    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
+    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
+    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
+    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
+    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
+    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
+    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
+    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
+    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
+    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
+    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
+    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
+    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
+    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
+    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
638
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
639
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
640
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
641
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642
    "paddw             %%mm3, %%mm4     \n\t"\
643
    "movq              %%mm2, %%mm0     \n\t"\
644
    "movq              %%mm5, %%mm6     \n\t"\
645
    "movq              %%mm4, %%mm3     \n\t"\
646
    "punpcklwd         %%mm2, %%mm2     \n\t"\
647
    "punpcklwd         %%mm5, %%mm5     \n\t"\
648
    "punpcklwd         %%mm4, %%mm4     \n\t"\
649
    "paddw             %%mm1, %%mm2     \n\t"\
650
    "paddw             %%mm1, %%mm5     \n\t"\
651
    "paddw             %%mm1, %%mm4     \n\t"\
652
    "punpckhwd         %%mm0, %%mm0     \n\t"\
653
    "punpckhwd         %%mm6, %%mm6     \n\t"\
654
    "punpckhwd         %%mm3, %%mm3     \n\t"\
655
    "paddw             %%mm7, %%mm0     \n\t"\
656
    "paddw             %%mm7, %%mm6     \n\t"\
657
    "paddw             %%mm7, %%mm3     \n\t"\
658
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659
    "packuswb          %%mm0, %%mm2     \n\t"\
660
    "packuswb          %%mm6, %%mm5     \n\t"\
661
    "packuswb          %%mm3, %%mm4     \n\t"\
662
    "pxor              %%mm7, %%mm7     \n\t"
663 663
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
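This is the "vertical chrominance interpolation" named in the comment above: U and V are averaged between two chroma lines while Y is taken from buf0 alone. A sketch of the chroma arithmetic, under the assumption that pmulhw acts as a signed 16x16 high multiply and that the *_OFFSET/*_COEFF table entries carry the matching fixed-point scaling (helper names hypothetical):

    /* Models pmulhw: high 16 bits of the product (arithmetic shift assumed). */
    static int pmulhw_model(int a, int b) { return (a * b) >> 16; }

    static void rgb1b_chroma(int u0, int u1, int v0, int v1,
                             int u_off, int v_off,
                             int ub_c, int ug_c, int vg_c, int vr_c,
                             int *ub, int *ug, int *vg, int *vr)
    {
        int u = ((u0 + u1) >> 5) - u_off; /* paddw; psrlw $5 (FIXME overflow); psubw */
        int v = ((v0 + v1) >> 5) - v_off;
        *ug = pmulhw_model(u, ug_c);      /* green contribution of U */
        *vg = pmulhw_model(v, vg_c);      /* green contribution of V */
        *ub = pmulhw_model(u, ub_c);      /* blue contribution of U  */
        *vr = pmulhw_model(v, vr_c);      /* red contribution of V   */
    }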
664 664

  
665 665
#define REAL_WRITEBGR32(dst, dstw, index) \
666
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667
			"movq %%mm2, %%mm1		\n\t" /* B */\
668
			"movq %%mm5, %%mm6		\n\t" /* R */\
669
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
670
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
671
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
672
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
673
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
674
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
675
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
676
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
677
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
678
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
666
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667
    "movq      %%mm2, %%mm1     \n\t" /* B */\
668
    "movq      %%mm5, %%mm6     \n\t" /* R */\
669
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
670
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
671
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
672
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
673
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
674
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
675
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
676
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
677
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
678
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
679 679
\
680
			MOVNTQ(%%mm0, (dst, index, 4))\
681
			MOVNTQ(%%mm2, 8(dst, index, 4))\
682
			MOVNTQ(%%mm1, 16(dst, index, 4))\
683
			MOVNTQ(%%mm3, 24(dst, index, 4))\
680
    MOVNTQ(%%mm0,   (dst, index, 4))\
681
    MOVNTQ(%%mm2,  8(dst, index, 4))\
682
    MOVNTQ(%%mm1, 16(dst, index, 4))\
683
    MOVNTQ(%%mm3, 24(dst, index, 4))\
684 684
\
685
			"add $8, "#index"		\n\t"\
686
			"cmp "#dstw", "#index"		\n\t"\
687
			" jb 1b				\n\t"
685
    "add      $8, "#index"      \n\t"\
686
    "cmp "#dstw", "#index"      \n\t"\
687
    " jb      1b                \n\t"
688 688
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
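The punpcklbw/punpcklwd ladder above fans B, G, R and a zero byte out to one dword per pixel. A byte-level sketch of the layout it stores (little-endian BGR0; the reference function is hypothetical):

    static void write_bgr32_ref(unsigned char *dst, const unsigned char *b,
                                const unsigned char *g, const unsigned char *r,
                                int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            dst[4 * i + 0] = b[i]; /* punpcklbw %%mm4, %%mm2: B,G pairs */
            dst[4 * i + 1] = g[i];
            dst[4 * i + 2] = r[i]; /* punpcklbw %%mm7, %%mm5: R,0 pairs */
            dst[4 * i + 3] = 0;    /* %%mm7 = 0 per the macro's entry condition */
        }
    }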
689 689

  
690 690
#define REAL_WRITEBGR16(dst, dstw, index) \
691
			"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
692
			"pand "MANGLE(bFC)", %%mm4	\n\t" /* G */\
693
			"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
694
			"psrlq $3, %%mm2		\n\t"\
691
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
692
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
693
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
694
    "psrlq           $3, %%mm2  \n\t"\
695 695
\
696
			"movq %%mm2, %%mm1		\n\t"\
697
			"movq %%mm4, %%mm3		\n\t"\
696
    "movq         %%mm2, %%mm1  \n\t"\
697
    "movq         %%mm4, %%mm3  \n\t"\
698 698
\
699
			"punpcklbw %%mm7, %%mm3		\n\t"\
700
			"punpcklbw %%mm5, %%mm2		\n\t"\
701
			"punpckhbw %%mm7, %%mm4		\n\t"\
702
			"punpckhbw %%mm5, %%mm1		\n\t"\
699
    "punpcklbw    %%mm7, %%mm3  \n\t"\
700
    "punpcklbw    %%mm5, %%mm2  \n\t"\
701
    "punpckhbw    %%mm7, %%mm4  \n\t"\
702
    "punpckhbw    %%mm5, %%mm1  \n\t"\
703 703
\
704
			"psllq $3, %%mm3		\n\t"\
705
			"psllq $3, %%mm4		\n\t"\
704
    "psllq           $3, %%mm3  \n\t"\
705
    "psllq           $3, %%mm4  \n\t"\
706 706
\
707
			"por %%mm3, %%mm2		\n\t"\
708
			"por %%mm4, %%mm1		\n\t"\
707
    "por          %%mm3, %%mm2  \n\t"\
708
    "por          %%mm4, %%mm1  \n\t"\
709 709
\
710
			MOVNTQ(%%mm2, (dst, index, 2))\
711
			MOVNTQ(%%mm1, 8(dst, index, 2))\
710
    MOVNTQ(%%mm2,  (dst, index, 2))\
711
    MOVNTQ(%%mm1, 8(dst, index, 2))\
712 712
\
713
			"add $8, "#index"		\n\t"\
714
			"cmp "#dstw", "#index"		\n\t"\
715
			" jb 1b				\n\t"
713
    "add             $8, "#index"   \n\t"\
714
    "cmp        "#dstw", "#index"   \n\t"\
715
    " jb             1b             \n\t"
716 716
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
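Bit-level sketch of the RGB565 word each pixel becomes under the mask/shift sequence above, assuming bF8/bFC are the 0xF8/0xFC byte masks their names suggest (helper hypothetical):

    #include <stdint.h>

    static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t)(((r & 0xF8) << 8) | /* pand bF8; high byte -> bits 11-15 */
                          ((g & 0xFC) << 3) | /* pand bFC; psllq $3  -> bits 5-10  */
                          ((b & 0xF8) >> 3)); /* pand bF8; psrlq $3  -> bits 0-4   */
    }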
717 717

  
718 718
#define REAL_WRITEBGR15(dst, dstw, index) \
719
			"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
720
			"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
721
			"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
722
			"psrlq $3, %%mm2		\n\t"\
723
			"psrlq $1, %%mm5		\n\t"\
719
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
720
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
721
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
722
    "psrlq           $3, %%mm2  \n\t"\
723
    "psrlq           $1, %%mm5  \n\t"\
724 724
\
725
			"movq %%mm2, %%mm1		\n\t"\
726
			"movq %%mm4, %%mm3		\n\t"\
725
    "movq         %%mm2, %%mm1  \n\t"\
726
    "movq         %%mm4, %%mm3  \n\t"\
727 727
\
728
			"punpcklbw %%mm7, %%mm3		\n\t"\
729
			"punpcklbw %%mm5, %%mm2		\n\t"\
730
			"punpckhbw %%mm7, %%mm4		\n\t"\
731
			"punpckhbw %%mm5, %%mm1		\n\t"\
728
    "punpcklbw    %%mm7, %%mm3  \n\t"\
729
    "punpcklbw    %%mm5, %%mm2  \n\t"\
730
    "punpckhbw    %%mm7, %%mm4  \n\t"\
731
    "punpckhbw    %%mm5, %%mm1  \n\t"\
732 732
\
733
			"psllq $2, %%mm3		\n\t"\
734
			"psllq $2, %%mm4		\n\t"\
733
    "psllq           $2, %%mm3  \n\t"\
734
    "psllq           $2, %%mm4  \n\t"\
735 735
\
736
			"por %%mm3, %%mm2		\n\t"\
737
			"por %%mm4, %%mm1		\n\t"\
736
    "por          %%mm3, %%mm2  \n\t"\
737
    "por          %%mm4, %%mm1  \n\t"\
738 738
\
739
			MOVNTQ(%%mm2, (dst, index, 2))\
740
			MOVNTQ(%%mm1, 8(dst, index, 2))\
739
    MOVNTQ(%%mm2,  (dst, index, 2))\
740
    MOVNTQ(%%mm1, 8(dst, index, 2))\
741 741
\
742
			"add $8, "#index"		\n\t"\
743
			"cmp "#dstw", "#index"		\n\t"\
744
			" jb 1b				\n\t"
742
    "add             $8, "#index"   \n\t"\
743
    "cmp        "#dstw", "#index"   \n\t"\
744
    " jb             1b             \n\t"
745 745
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
746 746

  
747 747
#define WRITEBGR24OLD(dst, dstw, index) \
748
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749
			"movq %%mm2, %%mm1		\n\t" /* B */\
750
			"movq %%mm5, %%mm6		\n\t" /* R */\
751
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
752
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
753
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
754
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
755
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
756
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
757
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
758
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
759
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
760
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
748
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749
    "movq      %%mm2, %%mm1             \n\t" /* B */\
750
    "movq      %%mm5, %%mm6             \n\t" /* R */\
751
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
752
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
753
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
754
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
755
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
756
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
757
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
758
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
759
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
760
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
761 761
\
762
			"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
763
			"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
764
			"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
765
			"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
766
			"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
767
			"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
768
			"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
769
			"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
762
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
763
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
764
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
765
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
766
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
767
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
768
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
769
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
770 770
\
771
			"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
772
			"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
773
			"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
774
			"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
775
			"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
776
			"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
777
			"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
778
			"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
779
			"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
780
			"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
781
			"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
782
			"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
783
			"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
771
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
772
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
773
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
774
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
775
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
776
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
777
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
778
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
779
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
780
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
781
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
782
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
783
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
784 784
\
785
			"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
786
			"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
787
			"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
788
			"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
789
			"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
790
			"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
791
			"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
792
			"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
785
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
786
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
787
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
788
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
789
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
790
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
791
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
792
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
793 793
\
794
			MOVNTQ(%%mm0, (dst))\
795
			MOVNTQ(%%mm2, 8(dst))\
796
			MOVNTQ(%%mm3, 16(dst))\
797
			"add $24, "#dst"		\n\t"\
794
    MOVNTQ(%%mm0,   (dst))\
795
    MOVNTQ(%%mm2,  8(dst))\
796
    MOVNTQ(%%mm3, 16(dst))\
797
    "add         $24, "#dst"            \n\t"\
798 798
\
799
			"add $8, "#index"		\n\t"\
800
			"cmp "#dstw", "#index"		\n\t"\
801
			" jb 1b				\n\t"
799
    "add          $8, "#index"          \n\t"\
800
    "cmp     "#dstw", "#index"          \n\t"\
801
    " jb          1b                    \n\t"
802 802

  
803 803
#define WRITEBGR24MMX(dst, dstw, index) \
804
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805
			"movq %%mm2, %%mm1		\n\t" /* B */\
806
			"movq %%mm5, %%mm6		\n\t" /* R */\
807
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
808
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
809
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
810
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
811
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
812
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
813
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
814
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
815
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
816
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
804
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805
    "movq      %%mm2, %%mm1     \n\t" /* B */\
806
    "movq      %%mm5, %%mm6     \n\t" /* R */\
807
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
808
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
809
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
810
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
811
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
812
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
813
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
814
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
815
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
816
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
817 817
\
818
			"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
819
			"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
820
			"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
821
			"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
818
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
819
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
820
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
821
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
822 822
\
823
			"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
824
			"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
825
			"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
826
			"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
823
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
824
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
825
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
826
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
827 827
\
828
			"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
829
			"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
830
			"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
831
			"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
828
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
829
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
830
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
831
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
832 832
\
833
			"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
834
			"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
835
			"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
836
			"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
837
			MOVNTQ(%%mm0, (dst))\
833
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
834
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
835
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
836
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
837
    MOVNTQ(%%mm0, (dst))\
838 838
\
839
			"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
840
			"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
841
			"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
842
			"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
843
			MOVNTQ(%%mm6, 8(dst))\
839
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
840
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
841
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
842
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
843
    MOVNTQ(%%mm6, 8(dst))\
844 844
\
845
			"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
846
			"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
847
			"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
848
			MOVNTQ(%%mm5, 16(dst))\
845
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
846
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
847
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
848
    MOVNTQ(%%mm5, 16(dst))\
849 849
\
850
			"add $24, "#dst"		\n\t"\
850
    "add         $24, "#dst"    \n\t"\
851 851
\
852
			"add $8, "#index"			\n\t"\
853
			"cmp "#dstw", "#index"			\n\t"\
854
			" jb 1b				\n\t"
852
    "add          $8, "#index"  \n\t"\
853
    "cmp     "#dstw", "#index"  \n\t"\
854
    " jb          1b            \n\t"
855 855

  
856 856
#define WRITEBGR24MMX2(dst, dstw, index) \
857
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
858
			"movq "MANGLE(M24A)", %%mm0	\n\t"\
859
			"movq "MANGLE(M24C)", %%mm7	\n\t"\
860
			"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
861
			"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
... This diff was truncated because it exceeds the maximum size that can be displayed.
