blob: 6474f43b3da2f0e5cc2f8346fa4a9f6590cd8e5b [file] [log] [blame]
;
; jdmrgss2.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
;                                  JSAMPIMAGE input_buf,
;                                  JDIMENSION in_row_group_ctr,
;                                  JSAMPARRAY output_buf);
;

; r10 = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

; wk(i) addresses the i-th XMMWORD scratch slot; the slots sit just below the
; 16-byte-aligned frame pointer established in the prologue.
%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM  3

        align   16
        global  EXTN(jsimd_h2v1_merged_upsample_sse2)

; --------------------------------------------------------------------------
; jsimd_h2v1_merged_upsample_sse2
;
; Converts one row of Y plus one row each of half-width Cb/Cr into one row
; of interleaved RGB (RGB_PIXELSIZE bytes per pixel).
;
; After collect_args (macro defined in an include not shown here):
;   r10 = JDIMENSION output_width      (32-bit value)
;   r11 = JSAMPIMAGE input_buf
;   r12 = JDIMENSION in_row_group_ctr  (32-bit value)
;   r13 = JSAMPARRAY output_buf
;
; Register roles inside the loops:
;   rsi = inptr0 (Y), rbx = inptr1 (Cb), rdx = inptr2 (Cr), rdi = outptr
;   rcx = columns (pixels) still to be produced
;   al  = Yctr: two 16-pixel Y passes per 16-sample chroma load
;   wk(0..2) = (B-Y)H, (R-Y)H, (G-Y)H kept for the second Y pass
;
; xmmA..xmmH are register aliases defined outside this file (presumably in
; jcolsamp.inc, according to the configured RGB component order).
;
; Notes:
;   * The two JDIMENSION arguments are read through their 32-bit views
;     (r10d / r12d).  Writing ecx zero-extends into rcx, so stale upper
;     halves of the argument registers cannot leak into the column count
;     or the row index.
;   * Constant tables are addressed RIP-relative ("rel") so the code does
;     not need 32-bit absolute relocations.
;   * Input rows are read with movdqa, i.e. they must be 16-byte aligned.
; --------------------------------------------------------------------------
EXTN(jsimd_h2v1_merged_upsample_sse2):
        push        rbp
        mov         rax, rsp                    ; rax = rsp after the push (points at saved rbp)
        sub         rsp, byte 4
        and         rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
        mov         [rsp], rax                  ; remember the unaligned rsp
        mov         rbp, rsp                    ; rbp = aligned frame base
        lea         rsp, [wk(0)]                ; reserve wk[WK_NUM] below rbp
        push        rbx
        collect_args

        mov         ecx, r10d                   ; col (zero-extended output_width)
        test        rcx, rcx
        jz          near .return

        push        rcx                         ; rcx is reused as the row index below

        mov         rdi, r11
        mov         ecx, r12d                   ; zero-extended in_row_group_ctr
        mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
        mov         rdi, r13
        mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]     ; inptr0
        mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]     ; inptr1
        mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]     ; inptr2
        mov         rdi, JSAMPROW [rdi]                         ; outptr

        pop         rcx                         ; col

.columnloop:
        ; One iteration consumes 16 Cb and 16 Cr samples and (via the two
        ; Y passes below) up to 32 Y samples.

        movdqa      xmm6, XMMWORD [rbx]         ; xmm6=Cb(0123456789ABCDEF)
        movdqa      xmm7, XMMWORD [rdx]         ; xmm7=Cr(0123456789ABCDEF)

        pxor        xmm1, xmm1                  ; xmm1=(all 0's)
        pcmpeqw     xmm3, xmm3
        psllw       xmm3, 7                     ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        movdqa      xmm4, xmm6
        punpckhbw   xmm6, xmm1                  ; xmm6=Cb(89ABCDEF)=CbH
        punpcklbw   xmm4, xmm1                  ; xmm4=Cb(01234567)=CbL
        movdqa      xmm0, xmm7
        punpckhbw   xmm7, xmm1                  ; xmm7=Cr(89ABCDEF)=CrH
        punpcklbw   xmm0, xmm1                  ; xmm0=Cr(01234567)=CrL

        ; Adding 0xFF80 (= -128) per word re-centres the chroma samples.
        paddw       xmm6, xmm3
        paddw       xmm4, xmm3
        paddw       xmm7, xmm3
        paddw       xmm0, xmm3

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movdqa      xmm5, xmm6                  ; xmm5=CbH
        movdqa      xmm2, xmm4                  ; xmm2=CbL
        paddw       xmm6, xmm6                  ; xmm6=2*CbH
        paddw       xmm4, xmm4                  ; xmm4=2*CbL
        movdqa      xmm1, xmm7                  ; xmm1=CrH
        movdqa      xmm3, xmm0                  ; xmm3=CrL
        paddw       xmm7, xmm7                  ; xmm7=2*CrH
        paddw       xmm0, xmm0                  ; xmm0=2*CrL

        pmulhw      xmm6, [rel PW_MF0228]       ; xmm6=(2*CbH * -FIX(0.22800))
        pmulhw      xmm4, [rel PW_MF0228]       ; xmm4=(2*CbL * -FIX(0.22800))
        pmulhw      xmm7, [rel PW_F0402]        ; xmm7=(2*CrH * FIX(0.40200))
        pmulhw      xmm0, [rel PW_F0402]        ; xmm0=(2*CrL * FIX(0.40200))

        ; Add one, then halve: rounds the doubled products back down.
        paddw       xmm6, [rel PW_ONE]
        paddw       xmm4, [rel PW_ONE]
        psraw       xmm6, 1                     ; xmm6=(CbH * -FIX(0.22800))
        psraw       xmm4, 1                     ; xmm4=(CbL * -FIX(0.22800))
        paddw       xmm7, [rel PW_ONE]
        paddw       xmm0, [rel PW_ONE]
        psraw       xmm7, 1                     ; xmm7=(CrH * FIX(0.40200))
        psraw       xmm0, 1                     ; xmm0=(CrL * FIX(0.40200))

        paddw       xmm6, xmm5
        paddw       xmm4, xmm2
        paddw       xmm6, xmm5                  ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
        paddw       xmm4, xmm2                  ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
        paddw       xmm7, xmm1                  ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
        paddw       xmm0, xmm3                  ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

        movdqa      XMMWORD [wk(0)], xmm6       ; wk(0)=(B-Y)H
        movdqa      XMMWORD [wk(1)], xmm7       ; wk(1)=(R-Y)H

        ; Interleave Cb/Cr words so pmaddwd forms Cb*c0 + Cr*c1 per dword.
        movdqa      xmm6, xmm5
        movdqa      xmm7, xmm2
        punpcklwd   xmm5, xmm1
        punpckhwd   xmm6, xmm1
        pmaddwd     xmm5, [rel PW_MF0344_F0285]
        pmaddwd     xmm6, [rel PW_MF0344_F0285]
        punpcklwd   xmm2, xmm3
        punpckhwd   xmm7, xmm3
        pmaddwd     xmm2, [rel PW_MF0344_F0285]
        pmaddwd     xmm7, [rel PW_MF0344_F0285]

        paddd       xmm5, [rel PD_ONEHALF]
        paddd       xmm6, [rel PD_ONEHALF]
        psrad       xmm5, SCALEBITS
        psrad       xmm6, SCALEBITS
        paddd       xmm2, [rel PD_ONEHALF]
        paddd       xmm7, [rel PD_ONEHALF]
        psrad       xmm2, SCALEBITS
        psrad       xmm7, SCALEBITS

        packssdw    xmm5, xmm6                  ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
        packssdw    xmm2, xmm7                  ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
        psubw       xmm5, xmm1                  ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
        psubw       xmm2, xmm3                  ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

        movdqa      XMMWORD [wk(2)], xmm5       ; wk(2)=(G-Y)H

        ; First pass enters with xmm0/xmm2/xmm4 = (R-Y)L/(G-Y)L/(B-Y)L.
        mov         al, 2                       ; Yctr
        jmp         short .Yloop_1st

.Yloop_2nd:
        ; Second pass: reload the high-half chroma terms saved above.
        movdqa      xmm0, XMMWORD [wk(1)]       ; xmm0=(R-Y)H
        movdqa      xmm2, XMMWORD [wk(2)]       ; xmm2=(G-Y)H
        movdqa      xmm4, XMMWORD [wk(0)]       ; xmm4=(B-Y)H

.Yloop_1st:
        movdqa      xmm7, XMMWORD [rsi]         ; xmm7=Y(0123456789ABCDEF)

        pcmpeqw     xmm6, xmm6
        psrlw       xmm6, BYTE_BIT              ; xmm6={0xFF 0x00 0xFF 0x00 ..}
        pand        xmm6, xmm7                  ; xmm6=Y(02468ACE)=YE
        psrlw       xmm7, BYTE_BIT              ; xmm7=Y(13579BDF)=YO

        movdqa      xmm1, xmm0                  ; xmm1=xmm0=(R-Y)(L/H)
        movdqa      xmm3, xmm2                  ; xmm3=xmm2=(G-Y)(L/H)
        movdqa      xmm5, xmm4                  ; xmm5=xmm4=(B-Y)(L/H)

        paddw       xmm0, xmm6                  ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
        paddw       xmm1, xmm7                  ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
        packuswb    xmm0, xmm0                  ; xmm0=R(02468ACE********)
        packuswb    xmm1, xmm1                  ; xmm1=R(13579BDF********)

        paddw       xmm2, xmm6                  ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
        paddw       xmm3, xmm7                  ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
        packuswb    xmm2, xmm2                  ; xmm2=G(02468ACE********)
        packuswb    xmm3, xmm3                  ; xmm3=G(13579BDF********)

        paddw       xmm4, xmm6                  ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
        paddw       xmm5, xmm7                  ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
        packuswb    xmm4, xmm4                  ; xmm4=B(02468ACE********)
        packuswb    xmm5, xmm5                  ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

        punpcklbw   xmmA, xmmC          ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw   xmmE, xmmB          ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
        punpcklbw   xmmD, xmmF          ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

        movdqa      xmmG, xmmA
        movdqa      xmmH, xmmA
        punpcklwd   xmmA, xmmE          ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
        punpckhwd   xmmG, xmmE          ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

        psrldq      xmmH, 2             ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
        psrldq      xmmE, 2             ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

        movdqa      xmmC, xmmD
        movdqa      xmmB, xmmD
        punpcklwd   xmmD, xmmH          ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
        punpckhwd   xmmC, xmmH          ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

        psrldq      xmmB, 2             ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

        movdqa      xmmF, xmmE
        punpcklwd   xmmE, xmmB          ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
        punpckhwd   xmmF, xmmB          ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

        pshufd      xmmH, xmmA, 0x4E    ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
        movdqa      xmmB, xmmE
        punpckldq   xmmA, xmmD          ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
        punpckldq   xmmE, xmmH          ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
        punpckhdq   xmmD, xmmB          ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

        pshufd      xmmH, xmmG, 0x4E    ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
        movdqa      xmmB, xmmF
        punpckldq   xmmG, xmmC          ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
        punpckldq   xmmF, xmmH          ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
        punpckhdq   xmmC, xmmB          ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

        punpcklqdq  xmmA, xmmE          ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        punpcklqdq  xmmD, xmmG          ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        punpcklqdq  xmmF, xmmC          ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        ; Fewer than 16 pixels left: take the partial-store path.
        cmp         rcx, byte SIZEOF_XMMWORD
        jb          short .column_st32

        test        rdi, SIZEOF_XMMWORD-1
        jnz         short .out1
        ; --(aligned)-------------------
        movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
        add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD      ; outptr
        jmp         short .out0
.out1:  ; --(unaligned)-----------------
        ; maskmovdqu stores to [rdi] implicitly; an all-ones mask writes
        ; every byte, giving an unaligned non-temporal 16-byte store.
        pcmpeqb     xmmH, xmmH                  ; xmmH=(all 1's)
        maskmovdqu  xmmA, xmmH                  ; movntdqu XMMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmD, xmmH                  ; movntdqu XMMWORD [rdi], xmmD
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmF, xmmH                  ; movntdqu XMMWORD [rdi], xmmF
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
.out0:
        sub         rcx, byte SIZEOF_XMMWORD
        jz          near .endcolumn

        add         rsi, byte SIZEOF_XMMWORD    ; inptr0
        dec         al                          ; Yctr
        jnz         near .Yloop_2nd

        add         rbx, byte SIZEOF_XMMWORD    ; inptr1
        add         rdx, byte SIZEOF_XMMWORD    ; inptr2
        jmp         near .columnloop

.column_st32:
        pcmpeqb     xmmH, xmmH                  ; xmmH=(all 1's)
        lea         rcx, [rcx+rcx*2]            ; rcx = remaining pixels * RGB_PIXELSIZE (bytes)
        cmp         rcx, byte 2*SIZEOF_XMMWORD
        jb          short .column_st16
        maskmovdqu  xmmA, xmmH                  ; movntdqu XMMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmD, xmmH                  ; movntdqu XMMWORD [rdi], xmmD
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        movdqa      xmmA, xmmF
        sub         rcx, byte 2*SIZEOF_XMMWORD
        jmp         short .column_st15
.column_st16:
        cmp         rcx, byte SIZEOF_XMMWORD
        jb          short .column_st15
        maskmovdqu  xmmA, xmmH                  ; movntdqu XMMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        movdqa      xmmA, xmmD
        sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
        ; rcx = number of trailing bytes (< 16) held in xmmA.
        ; Build a byte mask in xmmE selecting only those bytes.
        mov         rax, rcx
        xor         rcx, byte 0x0F
        shl         rcx, 2
        movd        xmmB, ecx
        psrlq       xmmH, 4
        pcmpeqb     xmmE, xmmE
        psrlq       xmmH, xmmB
        psrlq       xmmE, xmmB
        punpcklbw   xmmE, xmmH
        ; ----------------
        ; If outptr is misaligned and the tail fits inside the current
        ; 16-byte line, round rdi down and shift data and mask up by the
        ; misalignment instead of storing from the unaligned address.
        mov         rcx, rdi
        and         rcx, byte SIZEOF_XMMWORD-1
        jz          short .adj0
        add         rax, rcx
        cmp         rax, byte SIZEOF_XMMWORD
        ja          short .adj0
        and         rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
        shl         rcx, 3                      ; byte offset -> bit count (emulates pslldq xmmA/xmmE by rcx bytes)
        movdqa      xmmG, xmmA
        movdqa      xmmC, xmmE
        pslldq      xmmA, SIZEOF_XMMWORD/2
        pslldq      xmmE, SIZEOF_XMMWORD/2
        movd        xmmD, ecx
        sub         rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
        jb          short .adj1
        movd        xmmF, ecx
        psllq       xmmA, xmmF
        psllq       xmmE, xmmF
        jmp         short .adj0
.adj1:  neg         rcx
        movd        xmmF, ecx
        psrlq       xmmA, xmmF
        psrlq       xmmE, xmmF
        psllq       xmmG, xmmD
        psllq       xmmC, xmmD
        por         xmmA, xmmG
        por         xmmE, xmmC
.adj0:  ; ----------------
        maskmovdqu  xmmA, xmmE                  ; movntdqu XMMWORD [rdi], xmmA

%else ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
        pcmpeqb     xmm6, xmm6                  ; xmm6=XE=X(02468ACE********)
        pcmpeqb     xmm7, xmm7                  ; xmm7=XO=X(13579BDF********)
%else
        pxor        xmm6, xmm6                  ; xmm6=XE=X(02468ACE********)
        pxor        xmm7, xmm7                  ; xmm7=XO=X(13579BDF********)
%endif
        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

        punpcklbw   xmmA, xmmC          ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw   xmmE, xmmG          ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
        punpcklbw   xmmB, xmmD          ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
        punpcklbw   xmmF, xmmH          ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

        movdqa      xmmC, xmmA
        punpcklwd   xmmA, xmmE          ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
        punpckhwd   xmmC, xmmE          ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
        movdqa      xmmG, xmmB
        punpcklwd   xmmB, xmmF          ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
        punpckhwd   xmmG, xmmF          ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

        movdqa      xmmD, xmmA
        punpckldq   xmmA, xmmB          ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        punpckhdq   xmmD, xmmB          ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        movdqa      xmmH, xmmC
        punpckldq   xmmC, xmmG          ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        punpckhdq   xmmH, xmmG          ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        ; Fewer than 16 pixels left: take the partial-store path.
        cmp         rcx, byte SIZEOF_XMMWORD
        jb          short .column_st32

        test        rdi, SIZEOF_XMMWORD-1
        jnz         short .out1
        ; --(aligned)-------------------
        movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
        add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD      ; outptr
        jmp         short .out0
.out1:  ; --(unaligned)-----------------
        ; maskmovdqu stores to [rdi] implicitly; an all-ones mask writes
        ; every byte, giving an unaligned non-temporal 16-byte store.
        pcmpeqb     xmmE, xmmE                  ; xmmE=(all 1's)
        maskmovdqu  xmmA, xmmE                  ; movntdqu XMMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmD, xmmE                  ; movntdqu XMMWORD [rdi], xmmD
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmC, xmmE                  ; movntdqu XMMWORD [rdi], xmmC
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmH, xmmE                  ; movntdqu XMMWORD [rdi], xmmH
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
.out0:
        sub         rcx, byte SIZEOF_XMMWORD
        jz          near .endcolumn

        add         rsi, byte SIZEOF_XMMWORD    ; inptr0
        dec         al                          ; Yctr
        jnz         near .Yloop_2nd

        add         rbx, byte SIZEOF_XMMWORD    ; inptr1
        add         rdx, byte SIZEOF_XMMWORD    ; inptr2
        jmp         near .columnloop

.column_st32:
        ; rcx = remaining pixels (< 16); each XMM register holds 4 pixels.
        pcmpeqb     xmmE, xmmE                  ; xmmE=(all 1's)
        cmp         rcx, byte SIZEOF_XMMWORD/2
        jb          short .column_st16
        maskmovdqu  xmmA, xmmE                  ; movntdqu XMMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        maskmovdqu  xmmD, xmmE                  ; movntdqu XMMWORD [rdi], xmmD
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        movdqa      xmmA, xmmC
        movdqa      xmmD, xmmH
        sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
        cmp         rcx, byte SIZEOF_XMMWORD/4
        jb          short .column_st15
        maskmovdqu  xmmA, xmmE                  ; movntdqu XMMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        movdqa      xmmA, xmmD
        sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
        cmp         rcx, byte SIZEOF_XMMWORD/16
        jb          near .endcolumn             ; nothing left to store
        ; rcx = 1..3 trailing pixels held in xmmA; build their byte mask.
        mov         rax, rcx
        xor         rcx, byte 0x03
        inc         rcx
        shl         rcx, 4
        movd        xmmF, ecx
        psrlq       xmmE, xmmF
        punpcklbw   xmmE, xmmE
        ; ----------------
        ; If outptr is misaligned and the tail fits inside the current
        ; 16-byte line, round rdi down and shift data and mask up by the
        ; misalignment instead of storing from the unaligned address.
        mov         rcx, rdi
        and         rcx, byte SIZEOF_XMMWORD-1
        jz          short .adj0
        lea         rax, [rcx+rax*4]            ; misalignment + pixels*RGB_PIXELSIZE
        cmp         rax, byte SIZEOF_XMMWORD
        ja          short .adj0
        and         rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
        shl         rcx, 3                      ; byte offset -> bit count (emulates pslldq xmmA/xmmE by rcx bytes)
        movdqa      xmmB, xmmA
        movdqa      xmmG, xmmE
        pslldq      xmmA, SIZEOF_XMMWORD/2
        pslldq      xmmE, SIZEOF_XMMWORD/2
        movd        xmmC, ecx
        sub         rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
        jb          short .adj1
        movd        xmmH, ecx
        psllq       xmmA, xmmH
        psllq       xmmE, xmmH
        jmp         short .adj0
.adj1:  neg         rcx
        movd        xmmH, ecx
        psrlq       xmmA, xmmH
        psrlq       xmmE, xmmH
        psllq       xmmB, xmmC
        psllq       xmmG, xmmC
        por         xmmA, xmmB
        por         xmmE, xmmG
.adj0:  ; ----------------
        maskmovdqu  xmmA, xmmE                  ; movntdqu XMMWORD [rdi], xmmA

%endif ; RGB_PIXELSIZE ; ---------------

.endcolumn:
        sfence                                  ; flush the write buffer

.return:
        uncollect_args
        pop         rbx
        mov         rsp, rbp                    ; rsp <- aligned frame base
        pop         rsp                         ; rsp <- value saved in the prologue (points at saved rbp)
        pop         rbp
        ret
477
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
;                                  JSAMPIMAGE input_buf,
;                                  JDIMENSION in_row_group_ctr,
;                                  JSAMPARRAY output_buf);
;

; r10 = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

494 align 16
495 global EXTN(jsimd_h2v2_merged_upsample_sse2)
496
497EXTN(jsimd_h2v2_merged_upsample_sse2):
498 push rbp
499 mov rbp,rsp
500 push rbx
501 collect_args
502
503 mov rax, r10
504
505 mov rdi, r11
506 mov rcx, r12
507 mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
508 mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
509 mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
510 mov rdi, r13
511 lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
512
513 push rdx ; inptr2
514 push rbx ; inptr1
515 push rsi ; inptr00
516 mov rbx,rsp
517
518 push rdi
519 push rcx
520 push rax
521
522 mov rdx, rcx
523 mov rcx, rdi
524 mov rdi, rax
525 mov rsi, rbx
526
527 call EXTN(jsimd_h2v1_merged_upsample_sse2)
528
529 pop rax
530 pop rcx
531 pop rdi
532 pop rsi
533 pop rbx
534 pop rdx
535
536 add rdi, byte SIZEOF_JSAMPROW ; outptr1
537 add rsi, byte SIZEOF_JSAMPROW ; inptr01
538
539 push rdx ; inptr2
540 push rbx ; inptr1
541 push rsi ; inptr00
542 mov rbx,rsp
543
544 push rdi
545 push rcx
546 push rax
547
548 mov rdx, rcx
549 mov rcx, rdi
550 mov rdi, rax
551 mov rsi, rbx
552
553 call EXTN(jsimd_h2v1_merged_upsample_sse2)
554
555 pop rax
556 pop rcx
557 pop rdi
558 pop rsi
559 pop rbx
560 pop rdx
561
562 uncollect_args
563 pop rbx
564 pop rbp
565 ret