blob: b3ef195b6beb6536b969c72f5898def814d07793 [file] [log] [blame]
DRC8ca81ec2009-04-03 12:00:51 +00001;
2; jdclrss2.asm - colorspace conversion (SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19; --------------------------------------------------------------------------
20 SECTION SEG_TEXT
21 BITS 32
22;
23; Convert some rows of samples to the output colorspace.
24;
25; GLOBAL(void)
26; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
27; JSAMPIMAGE input_buf, JDIMENSION input_row,
28; JSAMPARRAY output_buf, int num_rows)
29;
30
; Stack-argument accessors.  'b' is the frame base: the value of esp right
; after the function's initial "push ebp", so [b+0] is the saved ebp,
; [b+4] is the return address and the arguments start at [b+8].
31%define out_width(b) (b)+8 ; JDIMENSION out_width
32%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
33%define input_row(b) (b)+16 ; JDIMENSION input_row
34%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
35%define num_rows(b) (b)+24 ; int num_rows
36
; Locals, addressed from the realigned ebp set up in the prologue:
;   [original_ebp]  slot at the aligned ebp; the prologue stores the frame
;                   base there and the epilogue pops it back into esp
;   wk(i)           xmmword scratch slots directly below the aligned ebp;
;                   wk(0) is the lowest, wk(WK_NUM-1) ends at ebp
;   gotptr          one pointer-sized slot immediately below wk(0)
; WK_NUM is defined after wk(i); that is fine because the macro is only
; expanded where it is used.
37%define original_ebp ebp+0
38%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
39%define WK_NUM 2
40%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
41
; --------------------------------------------------------------------------
; jsimd_ycc_rgb_convert_sse2
;
; Purpose : for each of num_rows rows, read one row from each of the three
;           input planes (labelled Y, Cb and Cr in the comments below) and
;           write interleaved output pixels of RGB_PIXELSIZE (3 or 4) bytes.
;           SIZEOF_XMMWORD columns are converted per inner-loop iteration.
; In      : five 32-bit stack arguments, read through out_width(eax) ..
;           num_rows(eax) with eax = esp right after "push ebp".
; Saved   : ebp, ebx, esi, edi are saved and restored.  eax, ecx, edx and
;           the xmm registers are overwritten.
;
; Register roles
;   outside .rowloop : esi/ebx/edx = cursors into the row-pointer arrays of
;                      planes 0/1/2, edi = cursor into output_buf,
;                      ecx = num_cols, eax = rows remaining
;   inside .rowloop  : esi/ebx/edx = inptr0/1/2, edi = outptr,
;                      ecx = columns remaining, eax = GOT address
;
; NOTE(review): the input rows are loaded with movdqa, which requires
;   16-byte aligned addresses, and a full xmmword is read even when fewer
;   columns remain -- presumably the caller allocates aligned, padded rows;
;   confirm.
; NOTE(review): xmmA..xmmH, PW_*/PD_* constants, SCALEBITS, GOTOFF, pushpic,
;   movpic, get_GOT and alignx are not defined in this part of the file.
;   xmmA..xmmH are presumably aliases that map xmm0..xmm7 onto the output
;   byte order -- confirm in the include files.
; --------------------------------------------------------------------------
42 align 16
43 global EXTN(jsimd_ycc_rgb_convert_sse2)
44
45EXTN(jsimd_ycc_rgb_convert_sse2):
; ---- prologue: build a frame whose ebp is aligned to SIZEOF_XMMWORD ----
46 push ebp
47 mov eax,esp ; eax = original ebp (frame base: [eax+8] is the first argument)
48 sub esp, byte 4 ; guarantees the aligned slot below lies under the saved ebp
49 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
50 mov [esp],eax ; keep the unaligned frame base; the epilogue pops it back into esp
51 mov ebp,esp ; ebp = aligned ebp
52 lea esp, [wk(0)] ; esp = lowest wk[] slot, i.e. the wk[] area is now reserved
53 pushpic eax ; make a room for GOT address
54 push ebx
55; push ecx ; need not be preserved
56; push edx ; need not be preserved
57 push esi
58 push edi
59
60 get_GOT ebx ; get GOT address
61 movpic POINTER [gotptr], ebx ; save GOT address
62
63 mov ecx, JDIMENSION [out_width(eax)] ; num_cols
64 test ecx,ecx
65 jz near .return ; out_width == 0: nothing to convert
66
67 push ecx ; park num_cols so ecx can hold input_row
68
69 mov edi, JSAMPIMAGE [input_buf(eax)]
70 mov ecx, JDIMENSION [input_row(eax)]
71 mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
72 mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
73 mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
74 lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
75 lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
76 lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
77
78 pop ecx ; ecx = num_cols again
79
80 mov edi, JSAMPARRAY [output_buf(eax)]
81 mov eax, INT [num_rows(eax)] ; eax = rows remaining; the frame base is not used after this
82 test eax,eax
83 jle near .return ; num_rows <= 0: nothing to convert
84 alignx 16,7
85.rowloop:
; Save the per-row state; .nextrow pops it in the reverse order.
86 push eax
87 push edi
88 push edx
89 push ebx
90 push esi
91 push ecx ; col
92
93 mov esi, JSAMPROW [esi] ; inptr0
94 mov ebx, JSAMPROW [ebx] ; inptr1
95 mov edx, JSAMPROW [edx] ; inptr2
96 mov edi, JSAMPROW [edi] ; outptr
97 movpic eax, POINTER [gotptr] ; load GOT address (eax)
98 alignx 16,7
99.columnloop:
100
; ---- load one xmmword of Cb and Cr and split each into even/odd words ----
101 movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
102 movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
103
104 pcmpeqw xmm4,xmm4 ; xmm4 = all 1's
105 pcmpeqw xmm7,xmm7 ; xmm7 = all 1's
106 psrlw xmm4,BYTE_BIT ; keep only the low byte of every word set
107 psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
108 movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
109
110 pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
111 psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
112 pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
113 psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
114
; Adding 0xFF80 (-128 as a signed word) recentres the chroma around zero.
115 paddw xmm4,xmm7
116 paddw xmm5,xmm7
117 paddw xmm0,xmm7
118 paddw xmm1,xmm7
119
120 ; (Original)
121 ; R = Y + 1.40200 * Cr
122 ; G = Y - 0.34414 * Cb - 0.71414 * Cr
123 ; B = Y + 1.77200 * Cb
124 ;
125 ; (This implementation)
126 ; R = Y + 0.40200 * Cr + Cr
127 ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
128 ; B = Y - 0.22800 * Cb + Cb + Cb
129
; Keep unscaled copies (xmm2/3 = Cb, xmm6/7 = Cr) and double the operands
; of the multiplies; the doubling is undone by the rounded shift below.
130 movdqa xmm2,xmm4 ; xmm2=CbE
131 movdqa xmm3,xmm5 ; xmm3=CbO
132 paddw xmm4,xmm4 ; xmm4=2*CbE
133 paddw xmm5,xmm5 ; xmm5=2*CbO
134 movdqa xmm6,xmm0 ; xmm6=CrE
135 movdqa xmm7,xmm1 ; xmm7=CrO
136 paddw xmm0,xmm0 ; xmm0=2*CrE
137 paddw xmm1,xmm1 ; xmm1=2*CrO
138
; pmulhw keeps the high 16 bits of each signed 16x16-bit product.
139 pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
140 pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
141 pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
142 pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
143
; Add PW_ONE, then arithmetic shift right by one: halve with rounding
; (presumably PW_ONE is a vector of word 1's -- defined elsewhere, confirm).
144 paddw xmm4,[GOTOFF(eax,PW_ONE)]
145 paddw xmm5,[GOTOFF(eax,PW_ONE)]
146 psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
147 psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
148 paddw xmm0,[GOTOFF(eax,PW_ONE)]
149 paddw xmm1,[GOTOFF(eax,PW_ONE)]
150 psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
151 psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
152
; Add the whole-number parts: Cb twice for B-Y, Cr once for R-Y.
153 paddw xmm4,xmm2
154 paddw xmm5,xmm3
155 paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
156 paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
157 paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
158 paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
159
; Spill B-Y to the stack work area; xmm4/xmm5 are reused below.
160 movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
161 movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
162
; ---- G-Y: interleave Cb/Cr words so pmaddwd forms Cb*c0 + Cr*c1 as dwords
; (c0/c1 come from PW_MF0344_F0285, per its name -FIX(0.344) / FIX(0.285)).
163 movdqa xmm4,xmm2
164 movdqa xmm5,xmm3
165 punpcklwd xmm2,xmm6
166 punpckhwd xmm4,xmm6
167 pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
168 pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
169 punpcklwd xmm3,xmm7
170 punpckhwd xmm5,xmm7
171 pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
172 pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
173
; Round (add PD_ONEHALF) and descale the 32-bit sums by SCALEBITS.
174 paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
175 paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
176 psrad xmm2,SCALEBITS
177 psrad xmm4,SCALEBITS
178 paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
179 paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
180 psrad xmm3,SCALEBITS
181 psrad xmm5,SCALEBITS
182
183 packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
184 packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
185 psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
186 psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
187
; ---- load Y, split into even/odd words and add it to each difference ----
188 movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
189
190 pcmpeqw xmm4,xmm4
191 psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
192 pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
193 psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
194
; packuswb saturates each signed word to an unsigned byte (0..255); only
; the low 8 bytes of each result are meaningful (marked ******** = unused).
195 paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
196 paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
197 packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
198 packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
199
200 paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
201 paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
202 packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
203 packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
204
205 paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
206 paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
207 packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
208 packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
209
210%if RGB_PIXELSIZE == 3 ; ---------------
211
; ---- 3 bytes per pixel: interleave into 48 packed bytes (xmmA,xmmD,xmmF).
; In the byte diagrams "cp" means output byte c (0..2) of pixel p (hex).
212 ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
213 ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
214 ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
215 ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
216
217 punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
218 punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
219 punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
220
221 movdqa xmmG,xmmA
222 movdqa xmmH,xmmA
223 punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
224 punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
225
226 psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
227 psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
228
229 movdqa xmmC,xmmD
230 movdqa xmmB,xmmD
231 punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
232 punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
233
234 psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
235
236 movdqa xmmF,xmmE
237 punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
238 punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
239
240 pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
241 movdqa xmmB,xmmE
242 punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
243 punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
244 punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
245
246 pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
247 movdqa xmmB,xmmF
248 punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
249 punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
250 punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
251
252 punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
253 punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
254 punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
255
; ---- store: full group of SIZEOF_XMMWORD pixels, or a partial tail ----
256 cmp ecx, byte SIZEOF_XMMWORD
257 jb short .column_st32 ; fewer columns left than one full group
258
259 test edi, SIZEOF_XMMWORD-1
260 jnz short .out1
261 ; --(aligned)-------------------
262 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
263 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
264 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
265 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
266 jmp short .out0
267.out1: ; --(unaligned)-----------------
; maskmovdqu stores to the address in edi implicitly; with an all-ones mask
; every byte is written, which the original comments call "movntdqu".
268 pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
269 maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
270 add edi, byte SIZEOF_XMMWORD ; outptr
271 maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
272 add edi, byte SIZEOF_XMMWORD ; outptr
273 maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
274 add edi, byte SIZEOF_XMMWORD ; outptr
275.out0:
276 sub ecx, byte SIZEOF_XMMWORD ; one full group of columns consumed
277 jz near .nextrow
278
279 add esi, byte SIZEOF_XMMWORD ; inptr0
280 add ebx, byte SIZEOF_XMMWORD ; inptr1
281 add edx, byte SIZEOF_XMMWORD ; inptr2
282 jmp near .columnloop
283 alignx 16,7
284
; ---- partial tail (3-byte pixels): ecx columns left, ecx < SIZEOF_XMMWORD.
; Whole xmmwords are written first; the remainder ends up in xmmA.
285.column_st32:
286 pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
287 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE  (ecx = bytes left to store)
288 cmp ecx, byte 2*SIZEOF_XMMWORD
289 jb short .column_st16
290 maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
291 add edi, byte SIZEOF_XMMWORD ; outptr
292 maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
293 add edi, byte SIZEOF_XMMWORD ; outptr
294 movdqa xmmA,xmmF ; remaining bytes are in the third vector
295 sub ecx, byte 2*SIZEOF_XMMWORD
296 jmp short .column_st15
297.column_st16:
298 cmp ecx, byte SIZEOF_XMMWORD
299 jb short .column_st15
300 maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
301 add edi, byte SIZEOF_XMMWORD ; outptr
302 movdqa xmmA,xmmD ; remaining bytes are in the second vector
303 sub ecx, byte SIZEOF_XMMWORD
304.column_st15:
; Build in xmmE a store mask whose byte i has its top bit set iff i < ecx
; (maskmovdqu looks only at the top bit of each mask byte).  The arithmetic
; below relies on ecx being in 0..15, i.e. on SIZEOF_XMMWORD being 16.
305 mov eax,ecx ; eax = bytes left (the GOT address is no longer needed in this row)
306 xor ecx, byte 0x0F ; ecx = 15 - bytes
307 shl ecx, 2 ; ecx = (15 - bytes) * 4 = shift count in bits
308 movd xmmB,ecx
309 psrlq xmmH,4
310 pcmpeqb xmmE,xmmE
311 psrlq xmmH,xmmB ; each qword of xmmH: low 4*bytes bits set
312 psrlq xmmE,xmmB ; each qword of xmmE: low 4*bytes+4 bits set
313 punpcklbw xmmE,xmmH ; interleave -> per-byte mask for the first 'bytes' bytes
314 ; ----------------
; If outptr is misaligned and the tail does not extend past the end of the
; xmmword containing outptr, round outptr down and shift data and mask left
; by the misalignment instead, so the same bytes are written through an
; aligned address.
315 mov ecx,edi
316 and ecx, byte SIZEOF_XMMWORD-1 ; ecx = misalignment of outptr in bytes
317 jz short .adj0 ; already aligned: store as is
318 add eax,ecx
319 cmp eax, byte SIZEOF_XMMWORD
320 ja short .adj0 ; tail extends past that xmmword: store as is
321 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
322 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
; Variable 128-bit left shift by ecx bits, emulated with 64-bit shifts.
323 movdqa xmmG,xmmA
324 movdqa xmmC,xmmE
325 pslldq xmmA, SIZEOF_XMMWORD/2 ; move the low qword into the high qword
326 pslldq xmmE, SIZEOF_XMMWORD/2
327 movd xmmD,ecx
328 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
329 jb short .adj1 ; shift < 64 bits: need to merge two parts
330 movd xmmF,ecx ; shift >= 64 bits: shift the moved qword by the excess
331 psllq xmmA,xmmF
332 psllq xmmE,xmmF
333 jmp short .adj0
334.adj1: neg ecx ; ecx = 64 - shift
335 movd xmmF,ecx
336 psrlq xmmA,xmmF ; bits carried from the low qword into the high qword
337 psrlq xmmE,xmmF
338 psllq xmmG,xmmD ; both qwords shifted left in place
339 psllq xmmC,xmmD
340 por xmmA,xmmG
341 por xmmE,xmmC
342.adj0: ; ----------------
343 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
344
345%else ; RGB_PIXELSIZE == 4 ; -----------
346
; ---- 4 bytes per pixel: the fourth byte is a constant filler (0xFF when
; RGBX_FILLER_0XFF is defined, otherwise 0x00) ----
347%ifdef RGBX_FILLER_0XFF
348 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
349 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
350%else
351 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
352 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
353%endif
; In the byte diagrams "cp" means output byte c (0..3) of pixel p (hex).
354 ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
355 ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
356 ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
357 ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
358
359 punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
360 punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
361 punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
362 punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
363
364 movdqa xmmC,xmmA
365 punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
366 punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
367 movdqa xmmG,xmmB
368 punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
369 punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
370
371 movdqa xmmD,xmmA
372 punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
373 punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
374 movdqa xmmH,xmmC
375 punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
376 punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
377
; ---- store: full group of SIZEOF_XMMWORD pixels (xmmA,xmmD,xmmC,xmmH),
; or a partial tail ----
378 cmp ecx, byte SIZEOF_XMMWORD
379 jb short .column_st32 ; fewer columns left than one full group
380
381 test edi, SIZEOF_XMMWORD-1
382 jnz short .out1
383 ; --(aligned)-------------------
384 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
385 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
386 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
387 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
388 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
389 jmp short .out0
390.out1: ; --(unaligned)-----------------
; maskmovdqu stores to the address in edi implicitly; with an all-ones mask
; every byte is written, which the original comments call "movntdqu".
391 pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
392 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
393 add edi, byte SIZEOF_XMMWORD ; outptr
394 maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
395 add edi, byte SIZEOF_XMMWORD ; outptr
396 maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
397 add edi, byte SIZEOF_XMMWORD ; outptr
398 maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
399 add edi, byte SIZEOF_XMMWORD ; outptr
400.out0:
401 sub ecx, byte SIZEOF_XMMWORD ; one full group of columns consumed
402 jz near .nextrow
403
404 add esi, byte SIZEOF_XMMWORD ; inptr0
405 add ebx, byte SIZEOF_XMMWORD ; inptr1
406 add edx, byte SIZEOF_XMMWORD ; inptr2
407 jmp near .columnloop
408 alignx 16,7
409
; ---- partial tail (4-byte pixels): ecx columns left, ecx < SIZEOF_XMMWORD.
; Here ecx stays in columns; with SIZEOF_XMMWORD = 16 one xmmword holds
; SIZEOF_XMMWORD/4 pixels.
410.column_st32:
411 pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
412 cmp ecx, byte SIZEOF_XMMWORD/2
413 jb short .column_st16
414 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
415 add edi, byte SIZEOF_XMMWORD ; outptr
416 maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
417 add edi, byte SIZEOF_XMMWORD ; outptr
418 movdqa xmmA,xmmC ; remaining pixels are in the third and fourth vectors
419 movdqa xmmD,xmmH
420 sub ecx, byte SIZEOF_XMMWORD/2
421.column_st16:
422 cmp ecx, byte SIZEOF_XMMWORD/4
423 jb short .column_st15
424 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
425 add edi, byte SIZEOF_XMMWORD ; outptr
426 movdqa xmmA,xmmD ; remaining pixels are in the next vector
427 sub ecx, byte SIZEOF_XMMWORD/4
428.column_st15:
429 cmp ecx, byte SIZEOF_XMMWORD/16
430 jb short .nextrow ; no pixels left to store
; Build in xmmE a store mask covering the first 4*ecx bytes.  The arithmetic
; below relies on ecx being in 1..3, i.e. on SIZEOF_XMMWORD being 16.
431 mov eax,ecx ; eax = pixels left (the GOT address is no longer needed in this row)
432 xor ecx, byte 0x03 ; ecx = 3 - pixels
433 inc ecx ; ecx = 4 - pixels
434 shl ecx, 4 ; ecx = (4 - pixels) * 16 = shift count in bits
435 movd xmmF,ecx
436 psrlq xmmE,xmmF ; each qword of xmmE: low 16*pixels bits set
437 punpcklbw xmmE,xmmE ; duplicate each byte -> first 4*pixels bytes are 0xFF
438 ; ----------------
; If outptr is misaligned and the tail does not extend past the end of the
; xmmword containing outptr, round outptr down and shift data and mask left
; by the misalignment instead, so the same bytes are written through an
; aligned address.
439 mov ecx,edi
440 and ecx, byte SIZEOF_XMMWORD-1 ; ecx = misalignment of outptr in bytes
441 jz short .adj0 ; already aligned: store as is
442 lea eax, [ecx+eax*4] ; RGB_PIXELSIZE  (eax = misalignment + bytes to store)
443 cmp eax, byte SIZEOF_XMMWORD
444 ja short .adj0 ; tail extends past that xmmword: store as is
445 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
446 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
; Variable 128-bit left shift by ecx bits, emulated with 64-bit shifts.
447 movdqa xmmB,xmmA
448 movdqa xmmG,xmmE
449 pslldq xmmA, SIZEOF_XMMWORD/2 ; move the low qword into the high qword
450 pslldq xmmE, SIZEOF_XMMWORD/2
451 movd xmmC,ecx
452 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
453 jb short .adj1 ; shift < 64 bits: need to merge two parts
454 movd xmmH,ecx ; shift >= 64 bits: shift the moved qword by the excess
455 psllq xmmA,xmmH
456 psllq xmmE,xmmH
457 jmp short .adj0
458.adj1: neg ecx ; ecx = 64 - shift
459 movd xmmH,ecx
460 psrlq xmmA,xmmH ; bits carried from the low qword into the high qword
461 psrlq xmmE,xmmH
462 psllq xmmB,xmmC ; both qwords shifted left in place
463 psllq xmmG,xmmC
464 por xmmA,xmmB
465 por xmmE,xmmG
466.adj0: ; ----------------
467 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
468
469%endif ; RGB_PIXELSIZE ; ---------------
470
471 alignx 16,7
472
473.nextrow:
; Restore the per-row state saved at .rowloop (reverse push order).
474 pop ecx
475 pop esi
476 pop ebx
477 pop edx
478 pop edi
479 pop eax
480
; Advance every row-pointer cursor by one entry.
481 add esi, byte SIZEOF_JSAMPROW
482 add ebx, byte SIZEOF_JSAMPROW
483 add edx, byte SIZEOF_JSAMPROW
484 add edi, byte SIZEOF_JSAMPROW ; output_buf
485 dec eax ; num_rows
486 jg near .rowloop
487
488 sfence ; flush the write buffer
489
; ---- epilogue: also reached directly when out_width == 0 or num_rows <= 0
; (those paths skip the sfence; no stores were issued on them) ----
490.return:
491 pop edi
492 pop esi
493; pop edx ; need not be preserved
494; pop ecx ; need not be preserved
495 pop ebx
496 mov esp,ebp ; esp <- aligned ebp
497 pop esp ; esp <- original ebp
498 pop ebp
499 ret
500