ffmpeg / libavcodec / x86 / h264_deblock_10bit.asm @ 9f3d6ca4

;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT   5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd       %1, %3
    movd       %2, %4
    SPLATW     %1, %1
    SPLATW     %2, %2
%endmacro

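; Load four int8_t tc0 values and splat them to one word per pixel.  Note:
; duplicating each byte and shifting right by 6 ((tc*0x101)>>6) multiplies tc
; by 4 (exact for 0 <= tc < 64), matching the 10-bit range, while tc0 = -1
; (edge not filtered) stays negative.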
; in:  %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro

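; Normal-strength filtering of the p0/q0 pair; scalar equivalent:
;   delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
;   p0'   = clip3(0, pixel_max, p0 + delta)
;   q0'   = clip3(0, pixel_max, q0 - delta)
; %5 holds tc already adjusted and masked (zero where nothing is filtered).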
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

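; p1 (resp. q1) filtering; scalar equivalent with %1=p2, %2=p1:
;   p1' = p1 + clip3(-tc, tc, ((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1)
; %5 holds tc masked by the per-side condition |p2-p0| < beta.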
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
    paddw       %1, %6
    pxor        %6, %6
    psraw       %1, 1
    psubw       %6, %5
    psubw       %1, %2
    CLIPW       %1, %6, %5
    paddw       %1, %2
%endmacro

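; Filter one outer pixel (p1 or q1) of the edge.
; in:  m5=p2 (resp. q2), %1=p0 (resp. q0), %2=p1 (resp. q1),
;      m1=p0, m2=q0, m7=mask from LOAD_MASK, tcm=tc
; out: m5=p1' (resp. q1'), %3=(|p2-p0| < beta) mask, reused later to adjust tc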
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6
    pxor        m6, m6
    mova        %3, m4
    pcmpgtw     m6, tcm
    pand        m4, tcm
    pandn       m6, m7
    pand        m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

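; Deblock one 16-pixel luma edge, mmsize/2 pixels per iteration.  alpha, beta
; and the tc0 values are scaled by 4 on entry (shl r2d/r3d and LOAD_TC), which
; maps the 8-bit-scale thresholds onto the 10-bit sample range.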
%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2, r3
    mov         r3, 32/mmsize
    mov         r2, r0
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1
.loop:
    mova        m0, [r0+r1]
    mova        m1, [r0+r1*2]
    mova        m2, [r2]
    mova        m3, [r2+r1]

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5

    mova        m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova      [r2], m2

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8
    dec         r3
    jg .loop
    ADD         rsp, pad
    RET

cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2, r3
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize
    mova        bm, m5
    add         r3, r1
%if mmsize == 16
    mov         r2, r0
    add         r2, r3
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

INIT_XMM
%ifdef ARCH_X86_64
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8
    SWAP         3, 9
%endmacro

%macro DEBLOCK_LUMA_64 1
cglobal deblock_v_luma_10_%1, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r0
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    mov         r3, 2
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2
    dec         r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10_%1, 5,7,15
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r1
    add         r2, r1
    add         r2, r1
    mov         r5, r0
    add         r5, r2
    mov         r6, 2
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2
    lea         r0, [r0+r1*8]
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_64 sse2
INIT_AVX
DEBLOCK_LUMA_64 avx
%endif

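; Move %2 into %1: a register rename (SWAP) when %1 is a register,
; a store when it is a memory operand.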
%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

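; H.264 strong (intra) luma filter; per-pixel select via mask0/mask1p:
;   where mask1p: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;                 p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;                 p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;   elsewhere, where mask0: p0' = (2*p1 + p0 + q1 + 2) >> 2; p1, p2 unchanged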
; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%ifdef ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

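; Reserve %1 spill slots on the stack: t0-t3 alias m4-m7,
; t4, t5, ... alias consecutive stack slots.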
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

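; Build the masks for the intra (strong) filter:
;   mask0  = normal filtering condition from LOAD_MASK
;   mask1* = mask0 & (|p0-q0| < alpha/4+2) & (|p2-p0| < beta)  (resp. q2/q0)
; out: %1=mask1p, %2=mask0, %3=mask1q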
; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%ifdef ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_10_%1, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
.loop
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_INTRA_64 sse2
INIT_AVX
DEBLOCK_LUMA_INTRA_64 avx

%endif

%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl    r2d, 2
    shl    r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl    r2d, 2
    shl    r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6     ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA mmxext
DEBLOCK_LUMA_INTRA mmxext
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
INIT_AVX
DEBLOCK_LUMA avx
DEBLOCK_LUMA_INTRA avx
%endif