;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
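; (two unsigned saturating subtracts OR'd together give |%1-%2|, since one of them is 0)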
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
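; (the result is a per-word mask, 0xffff where the comparison holds and 0 otherwise;
;  %5 is clobbered)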
%macro DIFF_LT   5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd       %1, %3
    movd       %2, %4
    SPLATW     %1, %1
    SPLATW     %2, %2
%endmacro

; in:  %2=tc reg
; out: %1=splatted tc
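; (punpcklbw doubles each tc0 byte into a word, i.e. tc0*257; the psraw by 6 then
;  gives tc0*4, tc on the 10-bit scale, while tc0 == -1 stays negative)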
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
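; (each |diff|-threshold word is negative exactly when its test passes, so ANDing the
;  three results and checking the sign bit yields the combined
;  |p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta mask)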
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
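; (delta = clip3(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3); p0' = p0+delta and
;  q0' = q0-delta, both clamped to [0, pw_pixel_max])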
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
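; (x1' = x1 + clip3(-tc, tc, ((x2 + ((p0+q0+1)>>1)) >> 1) - x1); this is the p1/q1
;  filter with x2 = p2 or q2)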
%macro LUMA_Q1 6
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
    paddw       %1, %6
    pxor        %6, %6
    psraw       %1, 1
    psubw       %6, %5
    psubw       %1, %2
    CLIPW       %1, %6, %5
    paddw       %1, %2
%endmacro

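; in:  m5=p2 (or q2), %1=p0 (or q0), %2=p1 (or q1), m1=p0, m2=q0, m7=mask, tcm=tc, bm=beta
; out: m5=filtered p1 (or q1), %3=(|p2-p0| < beta) mask, saved so the p0/q0 filter can
;      raise tc by 1 for that side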
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6
    pxor        m6, m6
    mova        %3, m4
    pcmpgtw     m6, tcm
    pand        m4, tcm
    pandn       m6, m7
    pand        m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

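; in: m0-m3 = four transposed output rows, each holding p1 p0 q0 q1 per quadword
;     %1 = pix+3*stride (SSE2 path only), %2 = 3*stride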
%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
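; (each luma edge is 16 pixels = 32 bytes of 10-bit samples, filtered in 32/mmsize
;  passes; tc0 holds one int8 per 4-pixel segment; the shl by 2 below scales the
;  8-bit-range alpha/beta thresholds to the 10-bit sample range)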
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2, r3
    mov         r3, 32/mmsize
    mov         r2, r0
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1
.loop:
    mova        m0, [r0+r1]
    mova        m1, [r0+r1*2]
    mova        m2, [r2]
    mova        m3, [r2+r1]

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5

    mova        m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5

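    ; ms1/ms2 are 0 or -1 per word, so the two psubw below add 1 to tc wherever the
    ; corresponding |p2-p0|/|q2-q0| < beta test passed; words with tc0 == -1 are masked out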
    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova      [r2], m2

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8
    dec         r3
    jg .loop
    ADD         rsp, pad
    RET

cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2, r3
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize
    mova        bm, m5
    add         r3, r1
%if mmsize == 16
    mov         r2, r0
    add         r2, r3
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

INIT_XMM
%ifdef ARCH_X86_64
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
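; (tc0 == -1 segments are masked out below; |p2-p0| < beta and |q2-q0| < beta gate the
;  p1/q1 filters and each add 1 to tc for the p0/q0 filter)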
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8
    SWAP         3, 9
%endmacro

%macro DEBLOCK_LUMA_64 1
cglobal deblock_v_luma_10_%1, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r0
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    mov         r3, 2
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2
    dec         r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10_%1, 5,7,15
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r1
    add         r2, r1
    add         r2, r1
    mov         r5, r0
    add         r5, r2
    mov         r6, 2
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2
    lea         r0, [r0+r1*8]
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_64 sse2
INIT_AVX
DEBLOCK_LUMA_64 avx
%endif

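; store %2 into %1: a register-register SWAP when %1 is a plain register,
; a mova when %1 is a memory operand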
%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
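; (strong intra filter, per side: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3,
;  p1' = (p2 + p1 + p0 + q0 + 2) >> 2, p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;  where mask1p is clear, p0' falls back to (2*p1 + p0 + q1 + 2) >> 2 under mask0,
;  chosen branchlessly with the xor/and select at the end)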
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%ifdef ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

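; %1 = number of stack temporaries: t0-t3 stay in registers m4-m7, t4 and up
; become [rsp+...] slots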
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
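; out: %1 = mask1p = mask0 & |p0-q0|<alpha/4+2 & |p2-p0|<beta
;      %2 = mask0 (normal filter mask)
;      %3 = mask1q = mask0 & |p0-q0|<alpha/4+2 & |q2-q0|<beta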
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%ifdef ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_10_%1, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_INTRA_64 sse2
INIT_AVX
DEBLOCK_LUMA_INTRA_64 avx

%endif

%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl    r2d, 2
    shl    r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl    r2d, 2
    shl    r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6     ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA mmxext
DEBLOCK_LUMA_INTRA mmxext
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
INIT_AVX
DEBLOCK_LUMA avx
DEBLOCK_LUMA_INTRA avx
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
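; (chroma intra filter: p0' = (2*p1 + p0 + q1 + 2) >> 2, q0' = (2*q1 + q0 + p1 + 2) >> 2,
;  applied only where the mask in %5 is set)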
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro

%macro CHROMA_V_LOAD 1
    mova        m0, [r0]    ; p1
    mova        m1, [r0+r1] ; p0
    mova        m2, [%1]    ; q0
    mova        m3, [%1+r1] ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

%macro DEBLOCK_CHROMA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
    mov         r5, r0
    sub         r0, r1
    sub         r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov         r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB     m4, m5, r2, r3
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor        m4, m4
    LOAD_TC     m6, r4
    psubw       m6, [pw_3]
    pmaxsw      m6, m4
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add         r0, mmsize
    add         r5, mmsize
    add         r4, mmsize/8
    dec         r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
    mov         r4, r0
    sub         r0, r1
    sub         r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov         r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB     m4, m5, r2, r3
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add         r0, mmsize
    add         r4, mmsize
    dec         r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_CHROMA mmxext
%endif
INIT_XMM
DEBLOCK_CHROMA sse2
INIT_AVX
DEBLOCK_CHROMA avx