Add SSE2 SIMD implementation of computationally intensive routines.
git-svn-id: svn://svn.code.sf.net/p/tigervnc/code/trunk@3653 3789f03b-4d11-0410-bbf8-ca57d06f2519
diff --git a/common/jpeg/simd/jsimd.h b/common/jpeg/simd/jsimd.h
index 98bcebc..371586e 100644
--- a/common/jpeg/simd/jsimd.h
+++ b/common/jpeg/simd/jsimd.h
@@ -14,6 +14,7 @@
#define JSIMD_MMX 0x01
#define JSIMD_3DNOW 0x02
#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
/* Short forms of external names for systems with brain-damaged linkers. */
@@ -21,32 +22,63 @@
#define jpeg_simd_cpu_support jSiCpuSupport
#define jsimd_rgb_ycc_convert_mmx jSRGBYCCM
#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM
+#define jconst_rgb_ycc_convert_sse2 jSCRGBYCCS2
+#define jsimd_rgb_ycc_convert_sse2 jSRGBYCCS2
+#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
+#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
#define jsimd_h2v2_downsample_mmx jSDnH2V2M
#define jsimd_h2v1_downsample_mmx jSDnH2V1M
+#define jsimd_h2v2_downsample_sse2 jSDnH2V2S2
+#define jsimd_h2v1_downsample_sse2 jSDnH2V1S2
#define jsimd_h2v2_upsample_mmx jSUpH2V2M
#define jsimd_h2v1_upsample_mmx jSUpH2V1M
#define jsimd_h2v2_fancy_upsample_mmx jSFUpH2V2M
#define jsimd_h2v1_fancy_upsample_mmx jSFUpH2V1M
#define jsimd_h2v2_merged_upsample_mmx jSMUpH2V2M
#define jsimd_h2v1_merged_upsample_mmx jSMUpH2V1M
+#define jsimd_h2v2_upsample_sse2 jSUpH2V2S2
+#define jsimd_h2v1_upsample_sse2 jSUpH2V1S2
+#define jconst_fancy_upsample_sse2 jSCFUpS2
+#define jsimd_h2v2_fancy_upsample_sse2 jSFUpH2V2S2
+#define jsimd_h2v1_fancy_upsample_sse2 jSFUpH2V1S2
+#define jconst_merged_upsample_sse2 jSCMUpS2
+#define jsimd_h2v2_merged_upsample_sse2 jSMUpH2V2S2
+#define jsimd_h2v1_merged_upsample_sse2 jSMUpH2V1S2
#define jsimd_convsamp_mmx jSConvM
+#define jsimd_convsamp_sse2 jSConvS2
#define jsimd_convsamp_float_3dnow jSConvF3D
#define jsimd_convsamp_float_sse jSConvFS
+#define jsimd_convsamp_float_sse2 jSConvFS2
#define jsimd_fdct_islow_mmx jSFDMIS
#define jsimd_fdct_ifast_mmx jSFDMIF
+#define jconst_fdct_islow_sse2 jSCFDS2IS
+#define jsimd_fdct_islow_sse2 jSFDS2IS
+#define jconst_fdct_ifast_sse2 jSCFDS2IF
+#define jsimd_fdct_ifast_sse2 jSFDS2IF
#define jsimd_fdct_float_3dnow jSFD3DF
#define jconst_fdct_float_sse jSCFDSF
#define jsimd_fdct_float_sse jSFDSF
#define jsimd_quantize_mmx jSQuantM
+#define jsimd_quantize_sse2 jSQuantS2
#define jsimd_quantize_float_3dnow jSQuantF3D
#define jsimd_quantize_float_sse jSQuantFS
+#define jsimd_quantize_float_sse2 jSQuantFS2
#define jsimd_idct_2x2_mmx jSIDM22
#define jsimd_idct_4x4_mmx jSIDM44
+#define jconst_idct_red_sse2 jSCIDS2R
+#define jsimd_idct_2x2_sse2 jSIDS222
+#define jsimd_idct_4x4_sse2 jSIDS244
#define jsimd_idct_islow_mmx jSIDMIS
#define jsimd_idct_ifast_mmx jSIDMIF
+#define jconst_idct_islow_sse2 jSCIDS2IS
+#define jsimd_idct_islow_sse2 jSIDS2IS
+#define jconst_idct_ifast_sse2 jSCIDS2IF
+#define jsimd_idct_ifast_sse2 jSIDS2IF
#define jsimd_idct_float_3dnow jSID3DF
#define jconst_fdct_float_sse jSCIDSF
#define jsimd_idct_float_sse jSIDSF
+#define jconst_idct_float_sse2 jSCIDS2F
+#define jsimd_idct_float_sse2 jSIDS2F
#endif /* NEED_SHORT_EXTERNAL_NAMES */
/* SIMD Ext: retrieve SIMD/CPU information */
@@ -62,6 +94,17 @@
JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows));
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
/* SIMD Downsample */
EXTERN(void) jsimd_h2v2_downsample_mmx
JPP((JDIMENSION image_width, int max_v_samp_factor,
@@ -72,6 +115,15 @@
JDIMENSION v_samp_factor, JDIMENSION width_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v2_downsample_sse2
+ JPP((JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_sse2
+ JPP((JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+
/* SIMD Upsample */
EXTERN(void) jsimd_h2v2_upsample_mmx
JPP((int max_v_samp_factor, JDIMENSION output_width,
@@ -94,11 +146,38 @@
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,
DCTELEM * workspace));
+EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM * workspace));
+
EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,
FAST_FLOAT * workspace));
@@ -107,10 +186,19 @@
JDIMENSION start_col,
FAST_FLOAT * workspace));
+EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ FAST_FLOAT * workspace));
+
/* SIMD Forward DCT */
EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
+
EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
extern const int jconst_fdct_float_sse[];
@@ -121,6 +209,10 @@
DCTELEM * divisors,
DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
+ DCTELEM * divisors,
+ DCTELEM * workspace));
+
EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
FAST_FLOAT * divisors,
FAST_FLOAT * workspace));
@@ -129,6 +221,10 @@
FAST_FLOAT * divisors,
FAST_FLOAT * workspace));
+EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
+ FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace));
+
/* SIMD Reduced Inverse DCT */
EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
JCOEFPTR coef_block,
@@ -139,6 +235,16 @@
JSAMPARRAY output_buf,
JDIMENSION output_col));
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
/* SIMD Inverse DCT */
EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
JCOEFPTR coef_block,
@@ -149,6 +255,17 @@
JSAMPARRAY output_buf,
JDIMENSION output_col));
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
JCOEFPTR coef_block,
JSAMPARRAY output_buf,
@@ -160,3 +277,9 @@
JSAMPARRAY output_buf,
JDIMENSION output_col));
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+