;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

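; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3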
%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

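; like SBUTTERFLY, but with a separate destination for the high half:
; %2 = interleave-low(%2,%3), %4 = interleave-high(%2,%3)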
%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xff where |%1-%2| <= %3 (a full byte mask, inverse sense of DIFF_GT)
; clobbers: %5
%macro DIFF_GT2 5
%ifdef ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pxor    m5, m1, m2   ; p0^q0
    pand    m5, [pb_1]   ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4       ; d+128+33
    mova    m6, [pb_A1]
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 1
cglobal deblock_v_luma_8_%1, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

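    ; expand tc0 bytewise and fold (tc0 == -1) into the mask:
    ; m9 = filter mask & ~(tc0 == -1), m8 = tc & m9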
    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal deblock_h_luma_8_%1, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't back up r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8_%1

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA sse2
INIT_AVX
DEBLOCK_LUMA avx

%else

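; %1 = cpu name, %2 = function name infix (v = full xmm width, v8 = 8-pixel
; mmx halves), %3 = size in bytes of one stack spill slot (mmsize)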
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%2_luma_8_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal deblock_h_luma_8_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%2_luma_8_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%2_luma_8_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
INIT_AVX
DEBLOCK_LUMA avx, v, 16

%endif ; ARCH

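; %1..%3 = write targets for the filtered p0', p1', p2'; %4 = p3 (read only);
; the p1/p0/q0/q1 pixels and masks come in via the DEBLOCK_LUMA_INTRA defines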
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%ifdef ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%ifdef ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4

%ifdef ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

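; swap the p/q names so LUMA_INTRA_P012 can filter the q side with the same code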
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%2_luma_intra_8_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal deblock_h_luma_intra_8_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%2_luma_intra_8_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%2_luma_intra_8_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
INIT_AVX
DEBLOCK_LUMA_INTRA avx, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif

INIT_MMX

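; common setup for the vertical chroma filters: t5 = pix - 2*stride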
%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

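; common setup for the horizontal chroma filters:
; t5 = pix - 2, r0 = pix - 2 + 3*stride, t6 = 3*stride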
%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

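; the inter chroma functions take 5 arguments (incl. tc0), so temporaries go in r5/r6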
%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8_mmxext, 5,7
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    call ff_chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

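; in: m0=p1 m1=p0 m2=q0 m3=q1 r2d=alpha-1 r3d=beta-1 r4=tc0
; out: m1=p0' m2=q0'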
ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret


; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

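; the intra chroma functions take only 4 arguments, freeing r4/r5 for temporaries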
%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8_mmxext, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

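; in: m0=p1 m1=p0 m2=q0 m3=q1 r2d=alpha-1 r3d=beta-1
; out: m1=p0' m2=q0'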
ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret