In order to use the functions provided by this module, you need to import this module:

```
>>> import HardwareIntrinsics
```

These intrinsic functions are only available if your CPU supports `Sse2` features.

`mm_add_epi16`

Add packed 16-bit integers in "a" and "b", and store the results in "dst".

__m128i _mm_add_epi16 (__m128i a, __m128i b) PADDW xmm, xmm/m128

`mm_add_epi32`

Add packed 32-bit integers in "a" and "b", and store the results in "dst".

__m128i _mm_add_epi32 (__m128i a, __m128i b) PADDD xmm, xmm/m128

`mm_add_epi64`

Add packed 64-bit integers in "a" and "b", and store the results in "dst".

__m128i _mm_add_epi64 (__m128i a, __m128i b) PADDQ xmm, xmm/m128

`mm_add_epi8`

Add packed 8-bit integers in "a" and "b", and store the results in "dst".

__m128i _mm_add_epi8 (__m128i a, __m128i b) PADDB xmm, xmm/m128

`mm_add_pd`

Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".

__m128d _mm_add_pd (__m128d a, __m128d b) ADDPD xmm, xmm/m128

`mm_add_sd`

Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_add_sd (__m128d a, __m128d b) ADDSD xmm, xmm/m64

`mm_adds_epi16`

Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".

__m128i _mm_adds_epi16 (__m128i a, __m128i b) PADDSW xmm, xmm/m128

`mm_adds_epi8`

Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".

__m128i _mm_adds_epi8 (__m128i a, __m128i b) PADDSB xmm, xmm/m128

`mm_adds_epu16`

Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".

__m128i _mm_adds_epu16 (__m128i a, __m128i b) PADDUSW xmm, xmm/m128

`mm_adds_epu8`

Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".

__m128i _mm_adds_epu8 (__m128i a, __m128i b) PADDUSB xmm, xmm/m128

`mm_and_pd`

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".

__m128d _mm_and_pd (__m128d a, __m128d b) ANDPD xmm, xmm/m128

`mm_and_si128`

Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".

__m128i _mm_and_si128 (__m128i a, __m128i b) PAND xmm, xmm/m128

`mm_andnot_pd`

Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".

__m128d _mm_andnot_pd (__m128d a, __m128d b) ANDNPD xmm, xmm/m128

`mm_andnot_si128`

Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".

__m128i _mm_andnot_si128 (__m128i a, __m128i b) PANDN xmm, xmm/m128

`mm_avg_epu16`

Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".

__m128i _mm_avg_epu16 (__m128i a, __m128i b) PAVGW xmm, xmm/m128

`mm_avg_epu8`

Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".

__m128i _mm_avg_epu8 (__m128i a, __m128i b) PAVGB xmm, xmm/m128

`mm_bslli_si128`

Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".

__m128i _mm_bslli_si128 (__m128i a, int imm8) PSLLDQ xmm, imm8

`mm_bsrli_si128`

Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".

__m128i _mm_bsrli_si128 (__m128i a, int imm8) PSRLDQ xmm, imm8

`mm_cmpeq_epi16`

Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".

__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) PCMPEQW xmm, xmm/m128

`mm_cmpeq_epi32`

Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".

__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) PCMPEQD xmm, xmm/m128

`mm_cmpeq_epi8`

Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".

__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) PCMPEQB xmm, xmm/m128

`mm_cmpeq_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".

__m128d _mm_cmpeq_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(0)

`mm_cmpeq_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpeq_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(0)

`mm_cmpge_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".

__m128d _mm_cmpge_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(5)

`mm_cmpge_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpge_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(5)

`mm_cmpgt_epi16`

Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".

__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) PCMPGTW xmm, xmm/m128

`mm_cmpgt_epi32`

Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".

__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) PCMPGTD xmm, xmm/m128

`mm_cmpgt_epi8`

Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".

__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) PCMPGTB xmm, xmm/m128

`mm_cmpgt_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".

__m128d _mm_cmpgt_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(6)

`mm_cmpgt_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpgt_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(6)

`mm_cmple_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".

__m128d _mm_cmple_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(2)

`mm_cmple_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmple_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(2)

`mm_cmplt_epi16`

Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.

__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) PCMPGTW xmm, xmm/m128

`mm_cmplt_epi32`

Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.

__m128i _mm_cmplt_epi32 (__m128i a, __m128i b) PCMPGTD xmm, xmm/m128

`mm_cmplt_epi8`

Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.

__m128i _mm_cmplt_epi8 (__m128i a, __m128i b) PCMPGTB xmm, xmm/m128

`mm_cmplt_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".

__m128d _mm_cmplt_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(1)

`mm_cmplt_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmplt_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(1)

`mm_cmpneq_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".

__m128d _mm_cmpneq_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(4)

`mm_cmpneq_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpneq_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(4)

`mm_cmpnge_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".

__m128d _mm_cmpnge_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(1)

`mm_cmpnge_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpnge_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(1)

`mm_cmpngt_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".

__m128d _mm_cmpngt_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(2)

`mm_cmpngt_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpngt_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(2)

`mm_cmpnle_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".

__m128d _mm_cmpnle_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(6)

`mm_cmpnle_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpnle_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(6)

`mm_cmpnlt_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".

__m128d _mm_cmpnlt_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(5)

`mm_cmpnlt_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpnlt_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(5)

`mm_cmpord_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".

__m128d _mm_cmpord_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(7)

`mm_cmpord_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpord_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(7)

`mm_cmpunord_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".

__m128d _mm_cmpunord_pd (__m128d a, __m128d b) CMPPD xmm, xmm/m128, imm8(3)

`mm_cmpunord_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cmpunord_sd (__m128d a, __m128d b) CMPSD xmm, xmm/m64, imm8(3)

`mm_comieq_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).

int _mm_comieq_sd (__m128d a, __m128d b) COMISD xmm, xmm/m64

`mm_comige_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).

int _mm_comige_sd (__m128d a, __m128d b) COMISD xmm, xmm/m64

`mm_comigt_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).

int _mm_comigt_sd (__m128d a, __m128d b) COMISD xmm, xmm/m64

`mm_comile_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).

int _mm_comile_sd (__m128d a, __m128d b) COMISD xmm, xmm/m64

`mm_comilt_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).

int _mm_comilt_sd (__m128d a, __m128d b) COMISD xmm, xmm/m64

`mm_comineq_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).

int _mm_comineq_sd (__m128d a, __m128d b) COMISD xmm, xmm/m64

`mm_cvtepi32_pd`

Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".

__m128d _mm_cvtepi32_pd (__m128i a) CVTDQ2PD xmm, xmm/m64

`mm_cvtepi32_ps`

Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".

__m128 _mm_cvtepi32_ps (__m128i a) CVTDQ2PS xmm, xmm/m128

`mm_cvtpd_epi32`

Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".

__m128i _mm_cvtpd_epi32 (__m128d a) CVTPD2DQ xmm, xmm/m128

`mm_cvtpd_ps`

Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".

__m128 _mm_cvtpd_ps (__m128d a) CVTPD2PS xmm, xmm/m128

`mm_cvtps_epi32`

Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".

__m128i _mm_cvtps_epi32 (__m128 a) CVTPS2DQ xmm, xmm/m128

`mm_cvtps_pd`

Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".

__m128d _mm_cvtps_pd (__m128 a) CVTPS2PD xmm, xmm/m64

`mm_cvtsd_si32`

Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".

int _mm_cvtsd_si32 (__m128d a) CVTSD2SI r32, xmm/m64

`mm_cvtsd_ss`

Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128 _mm_cvtsd_ss (__m128 a, __m128d b) CVTSD2SS xmm, xmm/m64

`mm_cvtsi128_si32`

Copy the lower 32-bit integer in "a" to "dst".

int _mm_cvtsi128_si32 (__m128i a) MOVD reg/m32, xmm

`mm_cvtsi32_sd`

Convert the 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cvtsi32_sd (__m128d a, int b) CVTSI2SD xmm, reg/m32

`mm_cvtsi32_si128`

Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst".

__m128i _mm_cvtsi32_si128 (int a) MOVD xmm, reg/m32

`mm_cvtss_sd`

Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_cvtss_sd (__m128d a, __m128 b) CVTSS2SD xmm, xmm/m32

`mm_cvttpd_epi32`

Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".

__m128i _mm_cvttpd_epi32 (__m128d a) CVTTPD2DQ xmm, xmm/m128

`mm_cvttps_epi32`

Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".

__m128i _mm_cvttps_epi32 (__m128 a) CVTTPS2DQ xmm, xmm/m128

`mm_cvttsd_si32`

Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".

int _mm_cvttsd_si32 (__m128d a) CVTTSD2SI reg, xmm/m64

`mm_div_pd`

Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".

__m128d _mm_div_pd (__m128d a, __m128d b) DIVPD xmm, xmm/m128

`mm_div_sd`

Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_div_sd (__m128d a, __m128d b) DIVSD xmm, xmm/m64

`mm_extract_epi16`

Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".

int _mm_extract_epi16 (__m128i a, int immediate) PEXTRW reg, xmm, imm8

`mm_insert_epi16`

Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8".

__m128i _mm_insert_epi16 (__m128i a, int i, int immediate) PINSRW xmm, reg/m16, imm8

`mm_load_pd`

Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.

__m128d _mm_load_pd (double const* mem_address) MOVAPD xmm, m128

`mm_load_sd`

Load a double-precision (64-bit) floating-point element from memory into the lower of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary.

__m128d _mm_load_sd (double const* mem_address) MOVSD xmm, m64

`mm_load_si128`

Load 128-bits of integer data from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.

__m128i _mm_load_si128 (__m128i const* mem_address) MOVDQA xmm, m128

`mm_loadh_pd`

Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.

__m128d _mm_loadh_pd (__m128d a, double const* mem_addr) MOVHPD xmm, m64

`mm_loadl_epi32`

Load 32-bit integer from memory into the first element of "dst".

__m128i _mm_loadl_epi32 (__m128i const* mem_addr) MOVD xmm, reg/m32

`mm_loadl_epi64`

Load 64-bit integer from memory into the first element of "dst".

__m128i _mm_loadl_epi64 (__m128i const* mem_addr) MOVQ xmm, reg/m64

`mm_loadl_pd`

Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.

__m128d _mm_loadl_pd (__m128d a, double const* mem_addr) MOVLPD xmm, m64

`mm_loadu_pd`

Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary.

__m128d _mm_loadu_pd (double const* mem_address) MOVUPD xmm, m128

`mm_loadu_si128`

Load 128-bits of integer data from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary.

__m128i _mm_loadu_si128 (__m128i const* mem_address) MOVDQU xmm, m128

`mm_madd_epi16`

Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".

__m128i _mm_madd_epi16 (__m128i a, __m128i b) PMADDWD xmm, xmm/m128

`mm_maskmoveu_si128`

Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary.

void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_address) MASKMOVDQU xmm, xmm

`mm_max_epi16`

Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".

__m128i _mm_max_epi16 (__m128i a, __m128i b) PMAXSW xmm, xmm/m128

`mm_max_epu8`

Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".

__m128i _mm_max_epu8 (__m128i a, __m128i b) PMAXUB xmm, xmm/m128

`mm_max_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".

__m128d _mm_max_pd (__m128d a, __m128d b) MAXPD xmm, xmm/m128

`mm_max_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_max_sd (__m128d a, __m128d b) MAXSD xmm, xmm/m64

`mm_min_epi16`

Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".

__m128i _mm_min_epi16 (__m128i a, __m128i b) PMINSW xmm, xmm/m128

`mm_min_epu8`

Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".

__m128i _mm_min_epu8 (__m128i a, __m128i b) PMINUB xmm, xmm/m128

`mm_min_pd`

Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".

__m128d _mm_min_pd (__m128d a, __m128d b) MINPD xmm, xmm/m128

`mm_min_sd`

Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_min_sd (__m128d a, __m128d b) MINSD xmm, xmm/m64

`mm_move_epi64`

Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element.

__m128i _mm_move_epi64 (__m128i a) MOVQ xmm, xmm

`mm_move_sd`

Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_move_sd (__m128d a, __m128d b) MOVSD xmm, xmm

`mm_movemask_epi8`

Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".

int _mm_movemask_epi8 (__m128i a) PMOVMSKB reg, xmm

`mm_movemask_pd`

Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".

int _mm_movemask_pd (__m128d a) MOVMSKPD reg, xmm

`mm_mul_epu32`

Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst".

__m128i _mm_mul_epu32 (__m128i a, __m128i b) PMULUDQ xmm, xmm/m128

`mm_mul_pd`

Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".

__m128d _mm_mul_pd (__m128d a, __m128d b) MULPD xmm, xmm/m128

`mm_mul_sd`

Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_mul_sd (__m128d a, __m128d b) MULSD xmm, xmm/m64

`mm_mulhi_epi16`

Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".

__m128i _mm_mulhi_epi16 (__m128i a, __m128i b) PMULHW xmm, xmm/m128

`mm_mulhi_epu16`

Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".

__m128i _mm_mulhi_epu16 (__m128i a, __m128i b) PMULHUW xmm, xmm/m128

`mm_mullo_epi16`

Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".

__m128i _mm_mullo_epi16 (__m128i a, __m128i b) PMULLW xmm, xmm/m128

`mm_or_pd`

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".

__m128d _mm_or_pd (__m128d a, __m128d b) ORPD xmm, xmm/m128

`mm_or_si128`

Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".

__m128i _mm_or_si128 (__m128i a, __m128i b) POR xmm, xmm/m128

`mm_packs_epi16`

Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".

__m128i _mm_packs_epi16 (__m128i a, __m128i b) PACKSSWB xmm, xmm/m128

`mm_packs_epi32`

Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".

__m128i _mm_packs_epi32 (__m128i a, __m128i b) PACKSSDW xmm, xmm/m128

`mm_packus_epi16`

Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".

__m128i _mm_packus_epi16 (__m128i a, __m128i b) PACKUSWB xmm, xmm/m128

`mm_sad_epu8`

Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".

__m128i _mm_sad_epu8 (__m128i a, __m128i b) PSADBW xmm, xmm/m128

`mm_shuffle_epi32`

Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst".

__m128i _mm_shuffle_epi32 (__m128i a, int immediate) PSHUFD xmm, xmm/m128, imm8

`mm_shuffle_pd`

Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst".

__m128d _mm_shuffle_pd (__m128d a, __m128d b, int immediate) SHUFPD xmm, xmm/m128, imm8

`mm_shufflehi_epi16`

Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst".

__m128i _mm_shufflehi_epi16 (__m128i a, int immediate) PSHUFHW xmm, xmm/m128, imm8

`mm_shufflelo_epi16`

Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst".

__m128i _mm_shufflelo_epi16 (__m128i a, int control) PSHUFLW xmm, xmm/m128, imm8

`mm_sll_epi16`

Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".

__m128i _mm_sll_epi16 (__m128i a, __m128i count) PSLLW xmm, xmm/m128

`mm_sll_epi32`

Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".

__m128i _mm_sll_epi32 (__m128i a, __m128i count) PSLLD xmm, xmm/m128

`mm_sll_epi64`

Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".

__m128i _mm_sll_epi64 (__m128i a, __m128i count) PSLLQ xmm, xmm/m128

`mm_slli_epi16`

Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".

__m128i _mm_slli_epi16 (__m128i a, int immediate) PSLLW xmm, imm8

`mm_slli_epi32`

Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".

__m128i _mm_slli_epi32 (__m128i a, int immediate) PSLLD xmm, imm8

`mm_slli_epi64`

Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".

__m128i _mm_slli_epi64 (__m128i a, int immediate) PSLLQ xmm, imm8

`mm_sqrt_pd`

Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".

__m128d _mm_sqrt_pd (__m128d a) SQRTPD xmm, xmm/m128

`mm_sqrt_sd`

Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_sqrt_sd (__m128d a, __m128d b) SQRTSD xmm, xmm/m64

`mm_sqrt_sd1`

Compute the square root of the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_sqrt_sd (__m128d a) SQRTSD xmm, xmm/m64

`mm_sra_epi16`

Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".

__m128i _mm_sra_epi16 (__m128i a, __m128i count) PSRAW xmm, xmm/m128

`mm_sra_epi32`

Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".

__m128i _mm_sra_epi32 (__m128i a, __m128i count) PSRAD xmm, xmm/m128

`mm_srai_epi16`

Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".

__m128i _mm_srai_epi16 (__m128i a, int immediate) PSRAW xmm, imm8

`mm_srai_epi32`

Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".

__m128i _mm_srai_epi32 (__m128i a, int immediate) PSRAD xmm, imm8

`mm_srl_epi16`

Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".

__m128i _mm_srl_epi16 (__m128i a, __m128i count) PSRLW xmm, xmm/m128

`mm_srl_epi32`

Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".

__m128i _mm_srl_epi32 (__m128i a, __m128i count) PSRLD xmm, xmm/m128

`mm_srl_epi64`

Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".

__m128i _mm_srl_epi64 (__m128i a, __m128i count) PSRLQ xmm, xmm/m128

`mm_srli_epi16`

Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".

__m128i _mm_srli_epi16 (__m128i a, int immediate) PSRLW xmm, imm8

`mm_srli_epi32`

Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".

__m128i _mm_srli_epi32 (__m128i a, int immediate) PSRLD xmm, imm8

`mm_srli_epi64`

Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".

__m128i _mm_srli_epi64 (__m128i a, int immediate) PSRLQ xmm, imm8

`mm_store_pd`

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.

void _mm_store_pd (double* mem_addr, __m128d a) MOVAPD m128, xmm

`mm_store_sd`

Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.

void _mm_store_sd (double* mem_addr, __m128d a) MOVSD m64, xmm

`mm_store_si128`

Store 128-bits of integer data from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.

void _mm_store_si128 (__m128i* mem_addr, __m128i a) MOVDQA m128, xmm

`mm_storeh_pd`

Store the upper double-precision (64-bit) floating-point element from "a" into memory.

void _mm_storeh_pd (double* mem_addr, __m128d a) MOVHPD m64, xmm

`mm_storel_epi64`

Store 64-bit integer from the first element of "a" into memory.

void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) MOVQ m64, xmm

`mm_storel_pd`

Store the lower double-precision (64-bit) floating-point element from "a" into memory.

void _mm_storel_pd (double* mem_addr, __m128d a) MOVLPD m64, xmm

`mm_storeu_pd`

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.

void _mm_storeu_pd (double* mem_addr, __m128d a) MOVUPD m128, xmm

`mm_storeu_si128`

Store 128 bits of integer data from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.

void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) MOVDQU m128, xmm

`mm_storeu_si32`

Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.

void _mm_storeu_si32 (void* mem_addr, __m128i a) MOVD m32, xmm

`mm_stream_pd`

Store 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.

void _mm_stream_pd (double* mem_addr, __m128d a) MOVNTPD m128, xmm

`mm_stream_si128`

Store 128 bits of integer data from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.

void _mm_stream_si128 (__m128i* mem_addr, __m128i a) MOVNTDQ m128, xmm

`mm_stream_si32`

Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.

void _mm_stream_si32 (int* mem_addr, int a) MOVNTI m32, r32

`mm_sub_epi16`

Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".

__m128i _mm_sub_epi16 (__m128i a, __m128i b) PSUBW xmm, xmm/m128

`mm_sub_epi32`

Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".

__m128i _mm_sub_epi32 (__m128i a, __m128i b) PSUBD xmm, xmm/m128

`mm_sub_epi64`

Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".

__m128i _mm_sub_epi64 (__m128i a, __m128i b) PSUBQ xmm, xmm/m128

`mm_sub_epi8`

Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".

__m128i _mm_sub_epi8 (__m128i a, __m128i b) PSUBB xmm, xmm/m128

`mm_sub_pd`

Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".

__m128d _mm_sub_pd (__m128d a, __m128d b) SUBPD xmm, xmm/m128

`mm_sub_sd`

Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".

__m128d _mm_sub_sd (__m128d a, __m128d b) SUBSD xmm, xmm/m64

`mm_subs_epi16`

Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".

__m128i _mm_subs_epi16 (__m128i a, __m128i b) PSUBSW xmm, xmm/m128

`mm_subs_epi8`

Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".

__m128i _mm_subs_epi8 (__m128i a, __m128i b) PSUBSB xmm, xmm/m128

`mm_subs_epu16`

Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".

__m128i _mm_subs_epu16 (__m128i a, __m128i b) PSUBUSW xmm, xmm/m128

`mm_subs_epu8`

Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".

__m128i _mm_subs_epu8 (__m128i a, __m128i b) PSUBUSB xmm, xmm/m128

`mm_ucomieq_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

int _mm_ucomieq_sd (__m128d a, __m128d b) UCOMISD xmm, xmm/m64

`mm_ucomige_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

int _mm_ucomige_sd (__m128d a, __m128d b) UCOMISD xmm, xmm/m64

`mm_ucomigt_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

int _mm_ucomigt_sd (__m128d a, __m128d b) UCOMISD xmm, xmm/m64

`mm_ucomile_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

int _mm_ucomile_sd (__m128d a, __m128d b) UCOMISD xmm, xmm/m64

`mm_ucomilt_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

int _mm_ucomilt_sd (__m128d a, __m128d b) UCOMISD xmm, xmm/m64

`mm_ucomineq_sd`

Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

int _mm_ucomineq_sd (__m128d a, __m128d b) UCOMISD xmm, xmm/m64

`mm_unpackhi_epi16`

Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst".

__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) PUNPCKHWD xmm, xmm/m128

`mm_unpackhi_epi32`

Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".

__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) PUNPCKHDQ xmm, xmm/m128

`mm_unpackhi_epi64`

Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst".

__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) PUNPCKHQDQ xmm, xmm/m128

`mm_unpackhi_epi8`

Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst".

__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) PUNPCKHBW xmm, xmm/m128

`mm_unpackhi_pd`

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst".

__m128d _mm_unpackhi_pd (__m128d a, __m128d b) UNPCKHPD xmm, xmm/m128

`mm_unpacklo_epi16`

Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst".

__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) PUNPCKLWD xmm, xmm/m128

`mm_unpacklo_epi32`

Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst".

__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) PUNPCKLDQ xmm, xmm/m128

`mm_unpacklo_epi64`

Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst".

__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) PUNPCKLQDQ xmm, xmm/m128

`mm_unpacklo_epi8`

Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst".

__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) PUNPCKLBW xmm, xmm/m128

`mm_unpacklo_pd`

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst".

__m128d _mm_unpacklo_pd (__m128d a, __m128d b) UNPCKLPD xmm, xmm/m128

`mm_xor_pd`

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".

__m128d _mm_xor_pd (__m128d a, __m128d b) XORPD xmm, xmm/m128

`mm_xor_si128`

Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".

__m128i _mm_xor_si128 (__m128i a, __m128i b) PXOR xmm, xmm/m128