ffmpeg / libavcodec / x86 / h264_deblock.asm @ fbb6b49d

;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
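
; e.g. PASS8ROWS(p, p+stride*3, stride, stride*3) yields the 8 addresses
; [p], [p+stride], ..., [p+stride*7]: splitting the rows across two base
; pointers keeps every offset encodable as stride, 2*stride, 3*stride or
; 4*stride from one of the bases (explanatory note, not original code).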

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro
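
; This is the usual punpck transpose ladder: interleave pairs of rows at
; byte, then word, then dword granularity, and element (r,c) ends up at
; (c,r). A scalar model of the whole macro (illustrative C, not part of
; this file):
;   for (r = 0; r < 8; r++)
;       for (c = 0; c < 4; c++)
;           out[c][r] = in[r][c];   /* 8 rows of 4 bytes -> 4 rows of 8 */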

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

%macro SBUTTERFLY3 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro
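
; Unlike an in-place butterfly, SBUTTERFLY3 writes the high-half interleave
; to a separate register (%4) and leaves %3 intact, so the transposes below
; can keep one input live across each stage and spill fewer rows to memory.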

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY3 bw, m0, m1, m7
    SBUTTERFLY3 bw, m2, m3, m1
    SBUTTERFLY3 bw, m4, m5, m3
    movq  [%9+0x10], m1
    SBUTTERFLY3 bw, m6, %8, m5
    SBUTTERFLY3 wd, m0, m2, m1
    SBUTTERFLY3 wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m7, [%9+0x10], m6
    SBUTTERFLY3 wd, m3, m5, m4
    SBUTTERFLY3 dq, m7, m3, m0
    SBUTTERFLY3 dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY3 bw, m0, m1, m7
    SBUTTERFLY3 bw, m2, m3, m1
    SBUTTERFLY3 bw, m4, m5, m3
    SBUTTERFLY3 bw, m6, %8, m5
    movq  %9,  m3
    SBUTTERFLY3 wd, m0, m2, m3
    SBUTTERFLY3 wd, m4, m6, m2
    SBUTTERFLY3 wd, m7, m1, m6
    movq  %11, m2
    movq  m2,  %9
    SBUTTERFLY3 wd, m2, m5, m1
    SBUTTERFLY3 dq, m0, m4, m5
    SBUTTERFLY3 dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY3 dq, m3, %11, m0
    SBUTTERFLY3 dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por     %4, %5
    psubusb %4, %3
%endmacro
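
; DIFF_GT exploits unsigned saturating subtraction: sat(a-b) | sat(b-a)
; equals |a-b| (at least one operand is always zero), and a further
; saturating subtract of the threshold leaves a nonzero byte exactly where
; |a-b| > threshold. One lane in scalar form (illustrative, not in the
; source):
;   uint8_t diff_gt(uint8_t a, uint8_t b, uint8_t c) {
;       uint8_t d = (a > b) ? a - b : b - a;   /* |a-b| */
;       return d > c ? d - c : 0;              /* nonzero iff |a-b| > c */
;   }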

; out: %4 = 0xFF where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
;      (call-site comments below name the threshold being tested;
;      the resulting mask is all-ones where that test fails)
; clobbers: %5
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
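
; One DIFF_GT2 lane in scalar form (illustrative, not in the source): the
; two saturated differences compare equal only when both clamp to zero,
; which is exactly the |a-b| <= c case.
;   uint8_t diff_gt2(uint8_t a, uint8_t b, uint8_t c) {
;       uint8_t x = (a > b) ? a - b : 0;   /* sat(a-b)   */
;       uint8_t y = (b > a) ? b - a : 0;   /* sat(b-a)   */
;       x = (x > c) ? x - c : 0;           /* sat(a-b-c) */
;       y = (y > c) ? y - c : 0;           /* sat(b-a-c) */
;       return (x == y) ? 0xFF : 0x00;
;   }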

%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
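
; m7 ends up as the per-pixel edge-filtering condition of H.264 clause 8.7:
; filter iff |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta. Each
; DIFF_GT leaves a nonzero byte where a threshold is exceeded; after or-ing
; the three results, pcmpeqb against zero converts "nothing exceeded" into
; an all-ones mask. Comparing against alpha-1/beta-1 with ">" is the
; unsigned-byte form of the spec's strict "< alpha"/"< beta"; the bS term
; of the spec's condition is handled separately, via tc0 = -1 masking in
; the callers.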

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2       ; p0^q0
    pand    m5, [pb_1]   ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4       ; d+128+33
    mova    m6, [pb_A1]
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
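
; This is the normal (bS < 4) filter, kept entirely in unsigned bytes by
; biasing everything through pavgb. Per pixel it applies (scalar model in
; C, illustrative only; av_clip/av_clip_uint8 as in FFmpeg's libavutil):
;   int d  = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0_out = av_clip_uint8(p0 + d);
;   q0_out = av_clip_uint8(q0 - d);
; The biased value "d+128+33" is split against pb_A1 (0xA1 = 128+33): the
; two saturated subtractions yield d's negative (m6) and positive (m3)
; parts, each clamped to tc (m7) with pminub before being applied to p0
; and q0.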

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
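
; pavgb rounds up ((a+b+1)>>1), but the outer average here must floor, so
; the pxor/pand/psubusb triplet subtracts the bit that double rounding
; would otherwise add: (a+b)>>1 == ((a+b+1)>>1) - ((a^b)&1). The final
; pmaxub/pminub pair is the clip to [q1-tc0, q1+tc0].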

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET
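
; The horizontal variant below filters a vertical edge by transposing a
; 6x16 strip into a scratch buffer, calling the vertical filter above on
; that buffer (alpha/beta/tc0 are still live in r2d/r3d/r4), and
; transposing back only the middle four rows, since p1..q1 are the only
; pixels the bS<4 filter can modify.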

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
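
; On x86_32 the same template is instantiated twice: the mmxext build (v8)
; filters 8 pixels per call, so its h wrapper calls it twice, advancing the
; pix argument by 8 and tc0 by 2 between calls; the sse2 build (v) covers
; all 16 pixels in one pass.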

%endif ; ARCH




%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
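
; The strong (bS=4) intra filter needs sums like (p2+2*p1+2*p0+2*q0+q1+4)/8
; that overflow a byte: a chain of pavgb's gets within one LSB of each
; result, the exact sum is also accumulated modulo 256 with paddb/psubb,
; and each psrlw/pavgb mpb_0/pxor/pand mpb_1/psubb sequence patches the low
; bit back in -- the same parity-correction idea as in LUMA_Q1. At the end,
; mask0 applies the 2-tap p0'b where only the basic condition holds, while
; mask1p upgrades it to the strong p0'a and also gates the p1/p2 writes
; (mask1q likewise on the q side).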

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif




INIT_MMX

%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call x264_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    call x264_chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
x264_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret




; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
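
; Same parity trick as in LUMA_Q1: pavgb computes (p0+q1+1)>>1, and
; subtracting (p0^q1)&1 before the second pavgb turns the final result into
; the spec's (p0 + q1 + 2*p1 + 2) >> 2 without ever leaving byte precision.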

%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call x264_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    call x264_chroma_intra_body_mmxext
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
x264_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret