diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7a09822a1dd5..e2a680d73ab0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -172,6 +172,20 @@ AMDGPU Support X86 Support ^^^^^^^^^^^ +- The MMX vector intrinsic functions from ``*mmintrin.h`` which + operate on `__m64` vectors, such as ``_mm_add_pi8``, have been + reimplemented to use the SSE2 instruction-set and XMM registers + unconditionally. These intrinsics are therefore *no longer + supported* if MMX is enabled without SSE2 -- either from targeting + CPUs from the Pentium-MMX through the Pentium 3, or explicitly via + passing arguments such as ``-mmmx -mno-sse2``. + +- The compiler builtins such as ``__builtin_ia32_paddb`` which + formerly implemented the above MMX intrinsic functions have been + removed. Any uses of these removed functions should migrate to the + functions defined by the ``*mmintrin.h`` headers. A mapping can be + found in the file ``clang/www/builtins.py``. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index a85e7918f4d7..06ca30d65f5b 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -47,107 +47,8 @@ TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "") // doesn't work in the presence of re-declaration of _mm_prefetch for windows. TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx") TARGET_BUILTIN(__builtin_ia32_emms, "v", "n", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddd, "V2iV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddsb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddsw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddusb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_paddusw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubd, "V2iV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubsb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubsw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubusb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psubusw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pmulhw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pmullw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pmaddwd, "V2iV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pand, "V1OiV1OiV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pandn, "V1OiV1OiV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_por, "V1OiV1OiV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pxor, "V1OiV1OiV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psllw, "V4sV4sV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pslld, "V2iV2iV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psllq, "V1OiV1OiV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrlw, "V4sV4sV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrld, "V2iV2iV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrlq, "V1OiV1OiV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psraw, "V4sV4sV1Oi", "ncV:64:", "mmx") 
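As a minimal migration sketch for the second release note above (the wrapper name is illustrative; the full builtin-to-intrinsic mapping lives in clang/www/builtins.py):

#include <mmintrin.h>

/* Previously: __m64 r = __builtin_ia32_paddb(a, b);  -- that builtin is removed.
 * Now: call the equivalent *mmintrin.h intrinsic instead. */
static __m64 add_bytes(__m64 a, __m64 b) {
  return _mm_add_pi8(a, b);
}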
-TARGET_BUILTIN(__builtin_ia32_psrad, "V2iV2iV1Oi", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psllwi, "V4sV4si", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pslldi, "V2iV2ii", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psllqi, "V1OiV1Oii", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrlwi, "V4sV4si", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrldi, "V2iV2ii", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrlqi, "V1OiV1Oii", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psrawi, "V4sV4si", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_psradi, "V2iV2ii", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_packsswb, "V8cV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_packssdw, "V4sV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_packuswb, "V8cV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_punpckhbw, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_punpckhwd, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_punpckhdq, "V2iV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_punpcklbw, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_punpcklwd, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_punpckldq, "V2iV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pcmpeqb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pcmpeqw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pcmpeqd, "V2iV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pcmpgtb, "V8cV8cV8c", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pcmpgtw, "V4sV4sV4s", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_pcmpgtd, "V2iV2iV2i", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_maskmovq, "vV8cV8cc*", "nV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_movntq, "vV1Oi*V1Oi", "nV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_vec_init_v2si, "V2iii", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_vec_init_v4hi, "V4sssss", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_vec_init_v8qi, "V8ccccccccc", "ncV:64:", "mmx") -TARGET_BUILTIN(__builtin_ia32_vec_ext_v2si, "iV2ii", "ncV:64:", "mmx") - -// MMX2 (MMX+SSE) intrinsics -TARGET_BUILTIN(__builtin_ia32_cvtpi2ps, "V4fV4fV2i", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_cvtps2pi, "V2iV4f", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pavgb, "V8cV8cV8c", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pavgw, "V4sV4sV4s", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pmaxsw, "V4sV4sV4s", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pmaxub, "V8cV8cV8c", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pminsw, "V4sV4sV4s", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pminub, "V8cV8cV8c", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse") - -// MMX+SSE2 -TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2") -TARGET_BUILTIN(__builtin_ia32_cvtpi2pd, "V2dV2i", "ncV:64:", "mmx,sse2") -TARGET_BUILTIN(__builtin_ia32_cvttpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2") 
-TARGET_BUILTIN(__builtin_ia32_paddq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2") -TARGET_BUILTIN(__builtin_ia32_pmuludq, "V1OiV2iV2i", "ncV:64:", "mmx,sse2") -TARGET_BUILTIN(__builtin_ia32_psubq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2") - -// MMX+SSSE3 -TARGET_BUILTIN(__builtin_ia32_pabsb, "V8cV8c", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_pabsd, "V2iV2i", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_pabsw, "V4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_palignr, "V8cV8cV8cIc", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_phaddd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_phaddsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_phaddw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_phsubd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_phsubsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_phsubw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_pmaddubsw, "V8cV8cV8c", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_pmulhrsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_pshufb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_psignw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_psignb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3") -TARGET_BUILTIN(__builtin_ia32_psignd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3") +TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse") +TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse") // SSE intrinsics. TARGET_BUILTIN(__builtin_ia32_comieq, "iV4fV4f", "ncV:128:", "sse") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f1dee801e4fe..0c4d0efb70ea 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14523,12 +14523,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // TODO: If we had a "freeze" IR instruction to generate a fixed undef // value, we should use that here instead of a zero. return llvm::Constant::getNullValue(ConvertType(E->getType())); - case X86::BI__builtin_ia32_vec_init_v8qi: - case X86::BI__builtin_ia32_vec_init_v4hi: - case X86::BI__builtin_ia32_vec_init_v2si: - return Builder.CreateBitCast(BuildVector(Ops), - llvm::Type::getX86_MMXTy(getLLVMContext())); - case X86::BI__builtin_ia32_vec_ext_v2si: + case X86::BI__builtin_ia32_vec_ext_v4hi: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v8hi: case X86::BI__builtin_ia32_vec_ext_v4si: @@ -14546,6 +14541,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // Otherwise we could just do this in the header file. 
return Builder.CreateExtractElement(Ops[0], Index); } + case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v16qi: case X86::BI__builtin_ia32_vec_set_v8hi: case X86::BI__builtin_ia32_vec_set_v4si: diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index e85bfc47aa5c..a3176570a468 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -52,9 +52,12 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ __target__("sse2,no-evex512"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("mmx,sse2,no-evex512"), __min_vector_width__(64))) + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) /// Adds lower double-precision values in both operands and returns the /// sum in the lower 64 bits of the result. The upper 64 bits of the result @@ -1486,8 +1489,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { + return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a)); } /// Converts the two double-precision floating-point elements of a @@ -1505,8 +1508,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { + return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a)); } /// Converts the two signed 32-bit integer elements of a 64-bit vector of @@ -1520,8 +1523,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { /// \param __a /// A 64-bit vector of [2 x i32]. /// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { - return __builtin_ia32_cvtpi2pd((__v2si)__a); +static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) { + return (__m128d) __builtin_convertvector((__v2si)__a, __v2df); } /// Returns the low-order element of a 128-bit vector of [2 x double] as @@ -2108,9 +2111,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, /// \param __b /// A 64-bit integer. /// \returns A 64-bit integer containing the sum of both parameters. 
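The converted intrinsics above keep their public signatures; only the lowering changes. A minimal usage sketch (the wrapper name is illustrative, assumes <emmintrin.h> with SSE2 enabled):

#include <emmintrin.h>

/* Caller code is unchanged; the conversion is now emitted through the
 * 128-bit cvtpd2dq form, keeping only the low 64 bits of the result. */
__m64 to_pi32(__m128d v) {
  return _mm_cvtpd_pi32(v);
}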
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, - __m64 __b) { - return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { + return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b)); } /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], @@ -2431,9 +2433,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, - __m64 __b) { - return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2539,9 +2541,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, - __m64 __b) { - return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { + return (__m64)((unsigned long long)__a - (unsigned long long)__b); } /// Subtracts the corresponding elements of two [2 x i64] vectors. @@ -4889,8 +4890,10 @@ void _mm_pause(void); #if defined(__cplusplus) } // extern "C" #endif + +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 4e154e2d8593..9d1e135be63b 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -21,10 +21,33 @@ typedef int __v2si __attribute__((__vector_size__(8))); typedef short __v4hi __attribute__((__vector_size__(8))); typedef char __v8qi __attribute__((__vector_size__(8))); +/* Unsigned types */ +typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); +typedef unsigned short __v4hu __attribute__((__vector_size__(8))); +typedef unsigned char __v8qu __attribute__((__vector_size__(8))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though. */ +typedef signed char __v8qs __attribute__((__vector_size__(8))); + +/* SSE/SSE2 types */ +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \ - __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_SSE2 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("sse2,no-evex512"), __min_vector_width__(128))) + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. @@ -50,10 +73,10 @@ _mm_empty(void) { /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) { - return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); + return __extension__ (__m64)(__v2si){__i, 0}; } /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit @@ -67,10 +90,10 @@ _mm_cvtsi32_si64(int __i) /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) { - return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); + return ((__v2si)__m)[0]; } /// Casts a 64-bit signed integer value into a 64-bit integer vector. @@ -83,7 +106,7 @@ _mm_cvtsi64_si32(__m64 __m) /// A 64-bit signed integer. /// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) { return (__m64)__i; @@ -99,7 +122,7 @@ _mm_cvtsi64_m64(long long __i) /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) { return (long long)__m; @@ -124,10 +147,11 @@ _mm_cvtm64_si64(__m64 __m) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_packsswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Converts, with saturation, 32-bit signed integers from both 64-bit integer @@ -149,10 +173,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -174,10 +199,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. 
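A small value-level sketch of the saturating pack defined above (values and the function name are illustrative, assumes <mmintrin.h>):

/* Signed saturation when narrowing: 300 -> 127, -300 -> -128; in-range
 * values pass through unchanged. */
__m64 narrow_words(void) {
  __m64 a = _mm_set_pi16(300, -300, 5, -5);       /* lanes: -5, 5, -300, 300 */
  return _mm_packs_pi16(a, _mm_setzero_si64());   /* low bytes: -5, 5, -128, 127 */
}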
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_packuswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] @@ -201,10 +227,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 4, 12, 5, 13, 6, 14, 7, 15); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -224,10 +251,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 2, 6, 3, 7); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -245,10 +273,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] @@ -272,10 +300,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 0, 8, 1, 9, 2, 10, 3, 11); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -295,10 +324,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 0, 4, 1, 5); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -316,10 +346,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } /// Adds each 8-bit integer element of the first 64-bit integer vector @@ -337,10 +367,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); } /// Adds each 16-bit integer element of the first 64-bit integer vector @@ -358,10 +388,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); } /// Adds each 32-bit integer element of the first 64-bit integer vector @@ -379,10 +409,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); } /// Adds, with saturation, each 8-bit signed integer element of the first @@ -403,10 +433,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2); } /// Adds, with saturation, each 16-bit signed integer element of the first @@ -427,10 +457,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2); } /// Adds, with saturation, each 8-bit unsigned integer element of the first @@ -450,10 +480,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. 
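A short sketch contrasting the wrapping and saturating byte adds above (values and names are illustrative, assumes <mmintrin.h>):

__m64 wrap_vs_sat(void) {
  __m64 a = _mm_set1_pi8(100);
  __m64 wrap = _mm_add_pi8(a, a);    /* 100 + 100 = 200 wraps to -56 per lane */
  __m64 sat  = _mm_adds_pi8(a, a);   /* saturates to 127 per lane */
  return _mm_xor_si64(wrap, sat);
}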
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2); } /// Adds, with saturation, each 16-bit unsigned integer element of the first @@ -473,10 +503,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2); } /// Subtracts each 8-bit integer element of the second 64-bit integer @@ -494,10 +524,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); } /// Subtracts each 16-bit integer element of the second 64-bit integer @@ -515,10 +545,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); } /// Subtracts each 32-bit integer element of the second 64-bit integer @@ -536,10 +566,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); } /// Subtracts, with saturation, each 8-bit signed integer element of the second @@ -560,10 +590,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2); } /// Subtracts, with saturation, each 16-bit signed integer element of the @@ -584,10 +614,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2); } /// Subtracts each 8-bit unsigned integer element of the second 64-bit @@ -608,10 +638,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2); } /// Subtracts each 16-bit unsigned integer element of the second 64-bit @@ -632,10 +662,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -659,10 +689,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -680,10 +711,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -701,10 +733,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); } /// Left-shifts each 16-bit signed integer element of the first @@ -724,10 +756,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. 
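For the multiply-add above, a value-level sketch (values and the name are illustrative, assumes <mmintrin.h>):

/* Each 32-bit result lane is the sum of two adjacent 16-bit products,
 * here 4*5 + 2*3 = 26. */
__m64 dot_pairs(void) {
  __m64 a = _mm_set_pi16(2, 4, 2, 4);  /* lanes: 4, 2, 4, 2 */
  __m64 b = _mm_set_pi16(3, 5, 3, 5);  /* lanes: 5, 3, 5, 3 */
  return _mm_madd_pi16(a, b);          /* both i32 lanes: 26 */
}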
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -746,10 +779,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), + __count)); } /// Left-shifts each 32-bit signed integer element of the first @@ -769,10 +803,11 @@ _mm_slli_pi16(__m64 __m, int __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -791,10 +826,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), + __count)); } /// Left-shifts the first 64-bit integer parameter by the number of bits @@ -811,10 +847,11 @@ _mm_slli_pi32(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), + (__v2di)__anyext128(__count))); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -831,10 +868,11 @@ _mm_sll_si64(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -855,10 +893,11 @@ _mm_slli_si64(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -878,10 +917,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -902,10 +942,11 @@ _mm_srai_pi16(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -925,10 +966,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -948,10 +990,11 @@ _mm_srai_pi32(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -970,10 +1013,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -993,10 +1037,11 @@ _mm_srli_pi16(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. 
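A sketch contrasting the arithmetic and logical right shifts above (values and the name are illustrative, assumes <mmintrin.h>):

__m64 shift_kinds(void) {
  __m64 v = _mm_set1_pi16(-32768);     /* 0x8000 in every lane */
  __m64 a = _mm_srai_pi16(v, 8);       /* sign-filled: 0xFF80 per lane */
  __m64 l = _mm_srli_pi16(v, 8);       /* zero-filled: 0x0080 per lane */
  return _mm_or_si64(a, l);
}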
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1015,10 +1060,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts the first 64-bit integer parameter by the number of bits @@ -1035,10 +1081,11 @@ _mm_srli_pi32(__m64 __m, int __count) /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m), + (__v2di)__anyext128(__count))); } /// Right-shifts the first parameter, which is a 64-bit integer, by the @@ -1056,10 +1103,11 @@ _mm_srl_si64(__m64 __m, __m64 __count) /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), + __count)); } /// Performs a bitwise AND of two 64-bit integer vectors. @@ -1074,10 +1122,10 @@ _mm_srli_si64(__m64 __m, int __count) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise NOT of the first 64-bit integer vector, and then @@ -1095,10 +1143,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of the second /// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); + return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise OR of two 64-bit integer vectors. @@ -1113,10 +1161,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); } /// Performs a bitwise exclusive OR of two 64-bit integer vectors. 
@@ -1131,10 +1179,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1153,10 +1201,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1175,10 +1223,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1197,10 +1245,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1219,10 +1267,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); + /* This function always performs a signed comparison, but __v8qi is a char + which may be signed or unsigned, so use __v8qs. */ + return (__m64)((__v8qs)__m1 > (__v8qs)__m2); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1241,10 +1291,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)((__v4hi)__m1 > (__v4hi)__m2); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1263,10 +1313,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. 
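The comment in _mm_cmpgt_pi8 above notes that the comparison is always signed; a small usage sketch (the name is illustrative, assumes <mmintrin.h>):

/* Signed byte comparison, independent of whether plain char is signed
 * on the target. */
__m64 negative_mask(__m64 v) {
  return _mm_cmpgt_pi8(_mm_setzero_si64(), v);  /* 0xFF where lane < 0 */
}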
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); + return (__m64)((__v2si)__m1 > (__v2si)__m2); } /// Constructs a 64-bit integer vector initialized to zero. @@ -1276,7 +1326,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) /// This intrinsic corresponds to the PXOR instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void) { return __extension__ (__m64){ 0LL }; @@ -1297,10 +1347,10 @@ _mm_setzero_si64(void) /// A 32-bit integer value used to initialize the lower 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi32(int __i1, int __i0) { - return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); + return __extension__ (__m64)(__v2si){__i0, __i1}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1320,10 +1370,10 @@ _mm_set_pi32(int __i1, int __i0) /// \param __s0 /// A 16-bit integer value used to initialize bits [15:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { - return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); + return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1351,12 +1401,12 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) /// \param __b0 /// An 8-bit integer value used to initialize bits [7:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, - __b4, __b5, __b6, __b7); + return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7}; } /// Constructs a 64-bit integer vector of [2 x i32], with each of the @@ -1372,7 +1422,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, /// A 32-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [2 x i32]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi32(int __i) { return _mm_set_pi32(__i, __i); @@ -1391,7 +1441,7 @@ _mm_set1_pi32(int __i) /// A 16-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [4 x i16]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi16(short __w) { return _mm_set_pi16(__w, __w, __w, __w); @@ -1409,7 +1459,7 @@ _mm_set1_pi16(short __w) /// An 8-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [8 x i8]. 
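A sketch of the element ordering used by the set and setr constructors above (values and the name are illustrative, assumes <mmintrin.h>):

/* _mm_set_pi16 lists elements high-to-low, _mm_setr_pi16 low-to-high;
 * both calls build the lanes {1, 2, 3, 4} here. */
__m64 same_vector(void) {
  __m64 a = _mm_set_pi16(4, 3, 2, 1);
  __m64 b = _mm_setr_pi16(1, 2, 3, 4);
  return _mm_cmpeq_pi16(a, b);         /* all lanes 0xFFFF */
}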
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi8(char __b) { return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); @@ -1430,7 +1480,7 @@ _mm_set1_pi8(char __b) /// A 32-bit integer value used to initialize the upper 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi32(int __i0, int __i1) { return _mm_set_pi32(__i1, __i0); @@ -1453,7 +1503,7 @@ _mm_setr_pi32(int __i0, int __i1) /// \param __w3 /// A 16-bit integer value used to initialize bits [63:48] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { return _mm_set_pi16(__w3, __w2, __w1, __w0); @@ -1484,14 +1534,16 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) /// \param __b7 /// An 8-bit integer value used to initialize bits [63:56] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7) { return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -#undef __DEFAULT_FN_ATTRS +#undef __anyext128 +#undef __trunc64 +#undef __DEFAULT_FN_ATTRS_SSE2 /* Aliases for compatibility. */ #define _m_empty _mm_empty diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index bf8327b692d1..bd832ce8dddf 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -19,11 +19,13 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ - __target__("ssse3,no-evex512"), __min_vector_width__(64))) -#define __DEFAULT_FN_ATTRS_MMX \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("mmx,ssse3,no-evex512"), \ - __min_vector_width__(64))) + __target__("ssse3,no-evex512"), __min_vector_width__(128))) + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) /// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer @@ -37,10 +39,10 @@ /// A 64-bit vector of [8 x i8]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi8(__m64 __a) { - return (__m64)__builtin_ia32_pabsb((__v8qi)__a); + return (__m64)__builtin_elementwise_abs((__v8qs)__a); } /// Computes the absolute value of each of the packed 8-bit signed @@ -73,10 +75,10 @@ _mm_abs_epi8(__m128i __a) /// A 64-bit vector of [4 x i16]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi16(__m64 __a) { - return (__m64)__builtin_ia32_pabsw((__v4hi)__a); + return (__m64)__builtin_elementwise_abs((__v4hi)__a); } /// Computes the absolute value of each of the packed 16-bit signed @@ -109,10 +111,10 @@ _mm_abs_epi16(__m128i __a) /// A 64-bit vector of [2 x i32]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi32(__m64 __a) { - return (__m64)__builtin_ia32_pabsd((__v2si)__a); + return (__m64)__builtin_elementwise_abs((__v2si)__a); } /// Computes the absolute value of each of the packed 32-bit signed @@ -177,7 +179,10 @@ _mm_abs_epi32(__m128i __a) /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. #define _mm_alignr_pi8(a, b, n) \ - ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))) + ((__m64)__builtin_shufflevector( \ + __builtin_ia32_psrldqi128_byteshift( \ + __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), __extension__ (__v2di){}, 0)) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. @@ -242,10 +247,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phaddw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -265,10 +271,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b) /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_phaddd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -317,10 +324,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadds_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phaddsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -386,10 +394,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. 
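A value-level sketch of the horizontal add above (values and the name are illustrative, assumes <tmmintrin.h>):

/* Result lanes are a0+a1, a2+a3, b0+b1, b2+b3. */
__m64 pairwise_sums(void) {
  __m64 a = _mm_set_pi16(4, 3, 2, 1);  /* lanes: 1, 2, 3, 4 */
  __m64 b = _mm_set_pi16(8, 7, 6, 5);  /* lanes: 5, 6, 7, 8 */
  return _mm_hadd_pi16(a, b);          /* lanes: 3, 7, 11, 15 */
}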
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phsubw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -409,10 +418,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_phsubd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -461,10 +471,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phsubsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -525,10 +536,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -565,10 +577,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b) /// A 64-bit vector of [4 x i16] containing one of the source operands. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhrs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Copies the 8-bit integers from a 128-bit integer vector to the @@ -614,12 +627,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the /// destination. \n -/// Bits [3:0] select the source byte to be copied. +/// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. 
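A usage sketch of the byte shuffle documented above (the name is illustrative, assumes <tmmintrin.h>):

/* Control byte: bit 7 set clears the lane, otherwise bits [2:0] select the
 * source byte.  This control vector reverses the eight bytes. */
__m64 reverse_bytes(__m64 v) {
  return _mm_shuffle_pi8(v, _mm_set_pi8(0, 1, 2, 3, 4, 5, 6, 7));
}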
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector( + (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), + (__v16qi)__anyext128(__b))); } /// For each 8-bit integer in the first source operand, perform one of @@ -720,10 +736,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b) /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// For each 16-bit integer in the first source operand, perform one of @@ -746,10 +763,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b) /// A 64-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// For each 32-bit integer in the first source operand, perform one of @@ -772,13 +790,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b) /// A 64-bit integer vector containing two control doublewords corresponding /// to positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #endif /* __TMMINTRIN_H */ diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 1ef89de9c9f5..5b9e90e8f061 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -35,9 +35,21 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16))); #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \ __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX \ +#define __DEFAULT_FN_ATTRS_SSE2 \ __attribute__((__always_inline__, __nodebug__, \ - __target__("mmx,sse,no-evex512"), __min_vector_width__(64))) + __target__("sse2,no-evex512"), __min_vector_width__(128))) + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) +#define __zeroupper64(x) \ + (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \ + 1, 4, 5) /// Adds the 32-bit float values in the low-order bits of the operands. 
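The __trunc64/__zext128/__anyext128/__zeroupper64 helpers defined above do all of the __m64 <-> __m128i plumbing for the SSE2-based reimplementation: the extension macros widen a 64-bit value into the low half of an XMM register (with zeroed or undefined upper lanes, respectively), and __trunc64 pulls the low 64 bits of a 128-bit result back out. A rough sketch of the intended semantics, assuming a Clang/GCC vector-extension environment; the type and function names here are illustrative, not taken from the patch:

typedef long long v1i64 __attribute__((__vector_size__(8)));
typedef long long v2i64 __attribute__((__vector_size__(16)));

/* Approximates __zext128: low 64 bits hold the __m64 value, upper 64 bits are zero. */
static inline v2i64 widen_zero(v1i64 v) {
  return (v2i64){ v[0], 0 };
}

/* Approximates __trunc64: keep only the low 64 bits of a 128-bit result. */
static inline v1i64 take_low64(v2i64 v) {
  return (v1i64){ v[0] };
}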
/// @@ -1448,10 +1460,10 @@ _mm_cvtss_si64(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a))); } /// Converts two low-order float values in a 128-bit vector of @@ -1468,7 +1480,7 @@ _mm_cvtps_pi32(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a) { return _mm_cvtps_pi32(__a); @@ -1558,10 +1570,10 @@ _mm_cvttss_si64(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a))); } /// Converts the lower (first) two elements of a 128-bit vector of [4 x float] @@ -1579,7 +1591,7 @@ _mm_cvttps_pi32(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a) { return _mm_cvttps_pi32(__a); @@ -1674,10 +1686,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value of the second operand. The upper 64 bits are copied from /// the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32_ps(__m128 __a, __m64 __b) { - return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); + return (__m128)__builtin_shufflevector( + (__v4sf)__a, + __builtin_convertvector((__v4si)__zext128(__b), __v4sf), + 4, 5, 2, 3); } /// Converts two elements of a 64-bit vector of [2 x i32] into two @@ -1697,7 +1712,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value from the second operand. The upper 64 bits are copied /// from the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_pi2ps(__m128 __a, __m64 __b) { return _mm_cvtpi32_ps(__a, __b); @@ -2231,10 +2246,10 @@ _mm_storer_ps(float *__p, __m128 __a) /// A pointer to an aligned memory location used to store the register value. /// \param __a /// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a) { - __builtin_ia32_movntq((__m64 *)__p, __a); + __builtin_nontemporal_store(__a, (__m64 *)__p); } /// Moves packed float values from a 128-bit vector of [4 x float] to a @@ -2296,7 +2311,7 @@ void _mm_sfence(void); /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. 
#define _mm_extract_pi16(a, n) \ - ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) + ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) /// Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset @@ -2342,10 +2357,10 @@ void _mm_sfence(void); /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); + return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2361,10 +2376,10 @@ _mm_max_pi16(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); + return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b); } /// Compares each of the corresponding packed 16-bit integer values of @@ -2380,10 +2395,10 @@ _mm_max_pu8(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); + return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2399,10 +2414,10 @@ _mm_min_pi16(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); + return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b); } /// Takes the most significant bit from each 8-bit element in a 64-bit @@ -2417,10 +2432,10 @@ _mm_min_pu8(__m64 __a, __m64 __b) /// A 64-bit integer vector containing the values with bits to be extracted. /// \returns The most significant bit from each 8-bit element in \a __a, /// written to bits [7:0]. -static __inline__ int __DEFAULT_FN_ATTRS_MMX +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a) { - return __builtin_ia32_pmovmskb((__v8qi)__a); + return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a)); } /// Multiplies packed 16-bit unsigned integer values and writes the @@ -2436,10 +2451,11 @@ _mm_movemask_pi8(__m64 __a) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the products of both operands. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the @@ -2476,8 +2492,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// _MM_SHUFFLE(b6, b4, b2, b0) can create an 8-bit mask of the form /// [b6, b4, b2, b0]. /// \returns A 64-bit integer vector containing the shuffled values. -#define _mm_shuffle_pi16(a, n) \ - ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))) +#define _mm_shuffle_pi16(a, n) \ + ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \ + (n) & 0x3, ((n) >> 2) & 0x3, \ + ((n) >> 4) & 0x3, ((n) >> 6) & 0x3)) /// Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as @@ -2502,10 +2520,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// A pointer to a 64-bit memory location that will receive the conditionally /// copied integer values. The address of the memory location does not have /// to be aligned. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) { - __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); + // This is complex, because we need to support the case where __p is pointing + // within the last 15 to 8 bytes of a page. In that case, using a 128-bit + // write might cause a trap where a 64-bit maskmovq would not. (Memory + // locations not selected by the mask bits might still cause traps.) + __m128i __d128 = __anyext128(__d); + __m128i __n128 = __zext128(__n); + if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 && + ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) { + // If there's a risk of spurious trap due to a 128-bit write, back up the + // pointer by 8 bytes and shift values in registers to match. + __p -= 8; + __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); + __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + } + + __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); } /// Computes the rounded averages of the packed unsigned 8-bit integer @@ -2521,10 +2554,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Computes the rounded averages of the packed unsigned 16-bit integer @@ -2540,10 +2574,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. 
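The one functional subtlety in the lowering above is _mm_maskmove_si64: maskmovdqu always performs a 16-byte-wide access, so a destination in the last 8 to 15 bytes of a page could fault where the original 8-byte maskmovq would not, which is why the header backs the pointer up by 8 bytes and shifts the operands in that window. A small worked check of the page-offset test, assuming the same 4 KiB page size used in the header; the helper name is illustrative:

#include <stdint.h>
#include <stdbool.h>

/* True when a 16-byte store at p could spill into the next page even though
   the intended 8-byte region stays inside the current one. */
static bool needs_backoff(const void *p) {
  uintptr_t off = (uintptr_t)p & 0xfff;        /* offset within a 4 KiB page */
  return off >= 4096 - 15 && off <= 4096 - 8;  /* offsets 4081..4088 */
}

/* Example: off == 4084 -> the 8-byte region 4084..4091 stays in-page, but a
   16-byte access would touch 4084..4099 and cross into the next page. */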
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Subtracts the corresponding 8-bit unsigned integer values of the two @@ -2562,10 +2597,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b) /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the /// sets of absolute differences between both operands. The upper bits are /// cleared. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } #if defined(__cplusplus) @@ -2846,22 +2882,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b) /// from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi16(__b, __a); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hi)__a, __v4sf); } /// Converts a 64-bit vector of 16-bit unsigned integer values into a @@ -2876,21 +2900,10 @@ _mm_cvtpi16_ps(__m64 __a) /// destination are copied from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hu)__a, __v4sf); } /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] @@ -2905,16 +2918,12 @@ _mm_cvtpu16_ps(__m64 __a) /// from the corresponding lower 4 elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi8(__b, __a); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){}, + 0, 1, 2, 3), __v4sf); } /// Converts the lower four unsigned 8-bit integer values from a 64-bit @@ -2930,15 +2939,12 @@ _mm_cvtpi8_ps(__m64 __a) /// operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the source operand. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){}, + 0, 1, 2, 3), __v4sf); } /// Converts the two 32-bit signed integer values from each 64-bit vector @@ -2957,16 +2963,12 @@ _mm_cvtpu8_ps(__m64 __a) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// copied and converted values from the first operand. The upper 64 bits /// contain the copied and converted values from the second operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) { - __m128 __c; - - __c = _mm_setzero_ps(); - __c = _mm_cvtpi32_ps(__c, __b); - __c = _mm_movelh_ps(__c, __c); - - return _mm_cvtpi32_ps(__c, __a); + return __builtin_convertvector( + __builtin_shufflevector((__v2si)__a, (__v2si)__b, + 0, 1, 2, 3), __v4sf); } /// Converts each single-precision floating-point element of a 128-bit @@ -2986,16 +2988,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) /// A 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a) { - __m64 __b, __c; - - __b = _mm_cvtps_pi32(__a); - __a = _mm_movehl_ps(__a, __a); - __c = _mm_cvtps_pi32(__a); - - return _mm_packs_pi32(__b, __c); + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps())); } /// Converts each single-precision floating-point element of a 128-bit @@ -3016,7 +3013,7 @@ _mm_cvtps_pi16(__m128 __a) /// 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the /// converted values and the uppper 32 bits are set to zero. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a) { __m64 __b, __c; @@ -3196,8 +3193,12 @@ do { \ #define _m_psadbw _mm_sad_pu8 #define _m_ _mm_ +#undef __trunc64 +#undef __zext128 +#undef __anyext128 +#undef __zeroupper64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX +#undef __DEFAULT_FN_ATTRS_SSE2 /* Ugly hack for backwards-compatibility (compatible with gcc) */ #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index be26454ce909..8f9057bbaf25 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -502,7 +502,6 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, switch (BuiltinID) { default: return false; - case X86::BI__builtin_ia32_vec_ext_v2si: case X86::BI__builtin_ia32_vec_ext_v2di: case X86::BI__builtin_ia32_vextractf128_pd256: case X86::BI__builtin_ia32_vextractf128_ps256: diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 5b5bc301bddc..495ae7e18115 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -1,193 +1,200 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx #include __m64 test_mm_abs_pi8(__m64 a) { // CHECK-LABEL: test_mm_abs_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b + // CHECK: call <8 x i8> @llvm.abs.v8i8( return _mm_abs_pi8(a); } __m64 test_mm_abs_pi16(__m64 a) { // CHECK-LABEL: test_mm_abs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w + // CHECK: call <4 x i16> @llvm.abs.v4i16( return _mm_abs_pi16(a); } __m64 test_mm_abs_pi32(__m64 a) { // CHECK-LABEL: test_mm_abs_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d + // CHECK: call <2 x i32> @llvm.abs.v2i32( return _mm_abs_pi32(a); } __m64 test_mm_add_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.b + // CHECK: add <8 x i8> {{%.*}}, {{%.*}} return _mm_add_pi8(a, b); } __m64 test_mm_add_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.w + // CHECK: add <4 x i16> {{%.*}}, {{%.*}} return _mm_add_pi16(a, b); } __m64 test_mm_add_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.d + // CHECK: add <2 x i32> {{%.*}}, {{%.*}} return _mm_add_pi32(a, b); } __m64 test_mm_add_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: add i64 {{%.*}}, {{%.*}} return _mm_add_si64(a, b); } __m64 test_mm_adds_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.padds.b + // CHECK: call <8 x i8> @llvm.sadd.sat.v8i8( return _mm_adds_pi8(a, b); 
} __m64 test_mm_adds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.padds.w + // CHECK: call <4 x i16> @llvm.sadd.sat.v4i16( return _mm_adds_pi16(a, b); } __m64 test_mm_adds_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b + // CHECK: call <8 x i8> @llvm.uadd.sat.v8i8( return _mm_adds_pu8(a, b); } __m64 test_mm_adds_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w + // CHECK: call <4 x i16> @llvm.uadd.sat.v4i16( return _mm_adds_pu16(a, b); } __m64 test_mm_alignr_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_alignr_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b + // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> return _mm_alignr_pi8(a, b, 2); } __m64 test_mm_and_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_and_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pand + // CHECK: and <1 x i64> {{%.*}}, {{%.*}} return _mm_and_si64(a, b); } __m64 test_mm_andnot_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_andnot_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pandn + // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}}, + // CHECK: and <1 x i64> [[TMP]], {{%.*}} return _mm_andnot_si64(a, b); } __m64 test_mm_avg_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_avg_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b + // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b( return _mm_avg_pu8(a, b); } __m64 test_mm_avg_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_avg_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w( return _mm_avg_pu16(a, b); } __m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b + // CHECK: [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8> return _mm_cmpeq_pi8(a, b); } __m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w + // CHECK: [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16> return _mm_cmpeq_pi16(a, b); } __m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d + // CHECK: [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32> return _mm_cmpeq_pi32(a, b); } __m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8> return _mm_cmpgt_pi8(a, b); } __m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16> return _mm_cmpgt_pi16(a, b); } __m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d + // CHECK: [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32> return _mm_cmpgt_pi32(a, b); } __m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) { // CHECK-LABEL: test_mm_cvt_pi2ps - // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 
x float> return _mm_cvt_pi2ps(a, b); } __m64 test_mm_cvt_ps2pi(__m128 a) { // CHECK-LABEL: test_mm_cvt_ps2pi - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq( return _mm_cvt_ps2pi(a); } __m64 test_mm_cvtpd_pi32(__m128d a) { // CHECK-LABEL: test_mm_cvtpd_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq( return _mm_cvtpd_pi32(a); } __m128 test_mm_cvtpi16_ps(__m64 a) { // CHECK-LABEL: test_mm_cvtpi16_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float> return _mm_cvtpi16_ps(a); } __m128d test_mm_cvtpi32_pd(__m64 a) { // CHECK-LABEL: test_mm_cvtpi32_pd - // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd + // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double> return _mm_cvtpi32_pd(a); } __m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) { // CHECK-LABEL: test_mm_cvtpi32_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvtpi32_ps(a, b); } __m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cvtpi32x2_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvtpi32x2_ps(a, b); } __m64 test_mm_cvtps_pi16(__m128 a) { // CHECK-LABEL: test_mm_cvtps_pi16 - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}}) + // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]], return _mm_cvtps_pi16(a); } __m64 test_mm_cvtps_pi32(__m128 a) { // CHECK-LABEL: test_mm_cvtps_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq( return _mm_cvtps_pi32(a); } @@ -205,19 +212,19 @@ int test_mm_cvtsi64_si32(__m64 a) { __m64 test_mm_cvttpd_pi32(__m128d a) { // CHECK-LABEL: test_mm_cvttpd_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq( return _mm_cvttpd_pi32(a); } __m64 test_mm_cvttps_pi32(__m128 a) { // CHECK-LABEL: test_mm_cvttps_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq( return _mm_cvttps_pi32(a); } int test_mm_extract_pi16(__m64 a) { // CHECK-LABEL: test_mm_extract_pi16 - // CHECK: call i32 @llvm.x86.mmx.pextr.w + // CHECK: extractelement <4 x i16> {{%.*}}, i64 2 return _mm_extract_pi16(a, 2); } @@ -234,151 +241,153 @@ __m64 test_m_from_int64(long long a) { __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 - // CHECK: call x86_mmx 
@llvm.x86.ssse3.phsub.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w + // CHECK: insertelement <4 x i16> return _mm_insert_pi16(a, d, 2); } __m64 test_mm_madd_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_madd_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd + // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd( return _mm_madd_pi16(a, b); } __m64 test_mm_maddubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_maddubs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128( return _mm_maddubs_pi16(a, b); } void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) { // CHECK-LABEL: test_mm_maskmove_si64 - // CHECK: call void @llvm.x86.mmx.maskmovq + // CHECK: call void @llvm.x86.sse2.maskmov.dqu( _mm_maskmove_si64(d, n, p); } __m64 test_mm_max_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_max_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w + // CHECK: call <4 x i16> @llvm.smax.v4i16( return _mm_max_pi16(a, b); } __m64 test_mm_max_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_max_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b + // CHECK: call <8 x i8> @llvm.umax.v8i8( return _mm_max_pu8(a, b); } __m64 test_mm_min_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_min_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w + // CHECK: call <4 x i16> @llvm.smin.v4i16( return _mm_min_pi16(a, b); } __m64 test_mm_min_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_min_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b + // CHECK: call <8 x i8> @llvm.umin.v8i8( return _mm_min_pu8(a, b); } int test_mm_movemask_pi8(__m64 a) { // CHECK-LABEL: test_mm_movemask_pi8 - // CHECK: call i32 @llvm.x86.mmx.pmovmskb + // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128( return _mm_movemask_pi8(a); } __m64 test_mm_mul_su32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mul_su32 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: and <2 x i64> {{%.*}}, + // CHECK: and <2 x i64> {{%.*}}, + // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} return _mm_mul_su32(a, b); } __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w( return _mm_mulhi_pi16(a, b); } __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w( return _mm_mulhi_pu16(a, b); } __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhrs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128( return _mm_mulhrs_pi16(a, b); } __m64 test_mm_mullo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mullo_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w + // CHECK: mul <4 x i16> {{%.*}}, {{%.*}} return _mm_mullo_pi16(a, b); } __m64 test_mm_or_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_or_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.por + // CHECK: or <1 x i64> {{%.*}}, {{%.*}} return _mm_or_si64(a, b); } __m64 test_mm_packs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: 
test_mm_packs_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.packsswb + // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128( return _mm_packs_pi16(a, b); } __m64 test_mm_packs_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.packssdw + // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128( return _mm_packs_pi32(a, b); } __m64 test_mm_packs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.packuswb + // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128( return _mm_packs_pu16(a, b); } __m64 test_mm_sad_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sad_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw + // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> return _mm_sad_pu8(a, b); } @@ -471,187 +480,187 @@ __m64 test_mm_set1_pi32(int a) { __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_shuffle_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b + // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128( return _mm_shuffle_pi8(a, b); } __m64 test_mm_shuffle_pi16(__m64 a) { // CHECK-LABEL: test_mm_shuffle_pi16 - // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_shuffle_pi16(a, 3); } __m64 test_mm_sign_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b + // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128( return _mm_sign_pi8(a, b); } __m64 test_mm_sign_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128( return _mm_sign_pi16(a, b); } __m64 test_mm_sign_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128( return _mm_sign_pi32(a, b); } __m64 test_mm_sll_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w( return _mm_sll_pi16(a, b); } __m64 test_mm_sll_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d( return _mm_sll_pi32(a, b); } __m64 test_mm_sll_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q( return _mm_sll_si64(a, b); } __m64 test_mm_slli_pi16(__m64 a) { // CHECK-LABEL: test_mm_slli_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w( return _mm_slli_pi16(a, 3); } __m64 test_mm_slli_pi32(__m64 a) { // CHECK-LABEL: test_mm_slli_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d + // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d( return _mm_slli_pi32(a, 3); } __m64 test_mm_slli_si64(__m64 a) { // CHECK-LABEL: test_mm_slli_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q + // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q( return _mm_slli_si64(a, 3); } __m64 test_mm_sra_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psra.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w( return _mm_sra_pi16(a, b); } __m64 test_mm_sra_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psra.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d( return _mm_sra_pi32(a, b); } __m64 
test_mm_srai_pi16(__m64 a) { // CHECK-LABEL: test_mm_srai_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w( return _mm_srai_pi16(a, 3); } __m64 test_mm_srai_pi32(__m64 a) { // CHECK-LABEL: test_mm_srai_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d( return _mm_srai_pi32(a, 3); } __m64 test_mm_srl_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w( return _mm_srl_pi16(a, b); } __m64 test_mm_srl_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d( return _mm_srl_pi32(a, b); } __m64 test_mm_srl_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q( return _mm_srl_si64(a, b); } __m64 test_mm_srli_pi16(__m64 a) { // CHECK-LABEL: test_mm_srli_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w( return _mm_srli_pi16(a, 3); } __m64 test_mm_srli_pi32(__m64 a) { // CHECK-LABEL: test_mm_srli_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d( return _mm_srli_pi32(a, 3); } __m64 test_mm_srli_si64(__m64 a) { // CHECK-LABEL: test_mm_srli_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q( return _mm_srli_si64(a, 3); } void test_mm_stream_pi(__m64 *p, __m64 a) { // CHECK-LABEL: test_mm_stream_pi - // CHECK: call void @llvm.x86.mmx.movnt.dq + // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal _mm_stream_pi(p, a); } void test_mm_stream_pi_void(void *p, __m64 a) { // CHECK-LABEL: test_mm_stream_pi_void - // CHECK: call void @llvm.x86.mmx.movnt.dq + // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal _mm_stream_pi(p, a); } __m64 test_mm_sub_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.b + // CHECK: sub <8 x i8> {{%.*}}, {{%.*}} return _mm_sub_pi8(a, b); } __m64 test_mm_sub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.w + // CHECK: sub <4 x i16> {{%.*}}, {{%.*}} return _mm_sub_pi16(a, b); } __m64 test_mm_sub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.d + // CHECK: sub <2 x i32> {{%.*}}, {{%.*}} return _mm_sub_pi32(a, b); } __m64 test_mm_sub_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: sub i64 {{%.*}}, {{%.*}} return _mm_sub_si64(a, b); } __m64 test_mm_subs_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b + // CHECK: call <8 x i8> @llvm.ssub.sat.v8i8( return _mm_subs_pi8(a, b); } __m64 test_mm_subs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w + // CHECK: call <4 x i16> @llvm.ssub.sat.v4i16( return _mm_subs_pi16(a, b); } __m64 test_mm_subs_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b + // CHECK: call <8 x i8> @llvm.usub.sat.v8i8( return _mm_subs_pu8(a, b); } __m64 test_mm_subs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pu16 - // CHECK: call x86_mmx 
@llvm.x86.mmx.psubus.w + // CHECK: call <4 x i16> @llvm.usub.sat.v4i16( return _mm_subs_pu16(a, b); } @@ -668,42 +677,42 @@ long long test_m_to_int64(__m64 a) { __m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw + // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> return _mm_unpackhi_pi8(a, b); } __m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_unpackhi_pi16(a, b); } __m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq + // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> return _mm_unpackhi_pi32(a, b); } __m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw + // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> return _mm_unpacklo_pi8(a, b); } __m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_unpacklo_pi16(a, b); } __m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq + // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> return _mm_unpacklo_pi32(a, b); } __m64 test_mm_xor_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_xor_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pxor + // CHECK: xor <1 x i64> {{%.*}}, {{%.*}} return _mm_xor_si64(a, b); } diff --git a/clang/test/CodeGen/X86/mmx-inline-asm.c b/clang/test/CodeGen/X86/mmx-inline-asm.c index a0702c7f780d..17fce1a48755 100644 --- a/clang/test/CodeGen/X86/mmx-inline-asm.c +++ b/clang/test/CodeGen/X86/mmx-inline-asm.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx -target-feature +sse2 %s -o - | FileCheck %s #include // CHECK: { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } diff --git a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c index 83be6b5517c0..741cb9c9c5ec 100644 --- a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c +++ b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c @@ -1,23 +1,23 @@ -// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +sse2 %s -o - | FileCheck %s #include void shift(__m64 a, __m64 b, int c) { - // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_slli_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_slli_pi32(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}}) _mm_slli_si64(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_srli_pi16(a, c); - // CHECK: x86_mmx 
@llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_srli_pi32(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}}) _mm_srli_si64(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_srai_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_srai_pi32(a, c); } diff --git a/clang/test/CodeGen/attr-target-x86-mmx.c b/clang/test/CodeGen/attr-target-x86-mmx.c index 01663766d984..39b26619475a 100644 --- a/clang/test/CodeGen/attr-target-x86-mmx.c +++ b/clang/test/CodeGen/attr-target-x86-mmx.c @@ -1,12 +1,11 @@ // RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s -// Picking a cpu that doesn't have mmx or sse by default so we can enable it later. +// Picking a cpu that doesn't have sse by default so we can enable it later. #define __MM_MALLOC_H #include -// Verify that when we turn on sse that we also turn on mmx. -void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) { +void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) { _mm_slli_pi16(a, c); _mm_slli_pi32(a, c); _mm_slli_si64(a, c); @@ -19,4 +18,4 @@ void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) { _mm_srai_pi32(a, c); } -// CHECK: "target-features"="+cx8,+mmx,+sse,+x87" +// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87" diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c index de31a4db5b0c..c42c3216ec53 100644 --- a/clang/test/CodeGen/builtins-x86.c +++ b/clang/test/CodeGen/builtins-x86.c @@ -168,26 +168,6 @@ void f0(void) { tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f); - tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s); - tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s); - tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s); - tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i); - tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s); - tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i); - tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2); @@ -220,45 +200,17 @@ void f0(void) { tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d); tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s); - tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i); - tmp_V2i = 
__builtin_ia32_phaddd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s); - tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s); tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s); - tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i); - tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s); - tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s); tmp_V8s = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c); - tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s); - tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c); - tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c); tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c); - tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s); - tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i); - tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i); - tmp_V8c = __builtin_ia32_pabsb(tmp_V8c); - tmp_V4s = __builtin_ia32_pabsw(tmp_V4s); - tmp_V2i = __builtin_ia32_pabsd(tmp_V2i); - tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi); - tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi); - tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi); - tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi); - tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi); - tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi); - tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi); - tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi); - tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s); - tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i); - tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s); - tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0); __builtin_ia32_incsspd(tmp_Ui); __builtin_ia32_incsspq(tmp_ULLi); @@ -306,8 +258,6 @@ void f0(void) { (void) __builtin_ia32_clzero(tmp_vp); (void) __builtin_ia32_cldemote(tmp_vp); - tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i); - tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f); tmp_i = __builtin_ia32_cvtss2si(tmp_V4f); tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); @@ -320,17 +270,12 @@ void f0(void) { tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f); tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f); #endif - tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); - (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); tmp_i = __builtin_ia32_movmskps(tmp_V4f); - tmp_i = __builtin_ia32_pmovmskb(tmp_V8c); - (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi); (void) __builtin_ia32_sfence(); #ifndef OPENCL (void) _mm_sfence(); #endif - tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c); tmp_V4f = __builtin_ia32_rcpps(tmp_V4f); tmp_V4f = __builtin_ia32_rcpss(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f); @@ -348,11 +293,8 @@ void f0(void) { tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d); tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d); tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d); - tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d); tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d); tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d); - tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d); - tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i); tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d); tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d); tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d); @@ -379,26 +321,9 @@ void f0(void) { (void) 
_mm_pause(); #endif - tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, imm_i_0_8); - tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, imm_i_0_8); - tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, imm_i_0_8); - tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, imm_i_0_8); - tmp_V2i = __builtin_ia32_psradi(tmp_V2i, imm_i_0_8); - tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, imm_i_0_8); - tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, imm_i_0_8); - tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, imm_i_0_8); // Using non-immediate argument supported for gcc compatibility - tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i); - tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i); - tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i); - tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i); - tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i); - tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i); - tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i); - tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i); - tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i); tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i); @@ -433,7 +358,6 @@ void f0(void) { (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui); tmp_V16c = __builtin_ia32_lddqu(tmp_cCp); tmp_V16c = __builtin_ia32_palignr128(tmp_V16c, tmp_V16c, imm_i); - tmp_V8c = __builtin_ia32_palignr(tmp_V8c, tmp_V8c, imm_i); #ifdef USE_SSE4 tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d); diff --git a/clang/test/CodeGen/palignr.c b/clang/test/CodeGen/palignr.c index 5a77597c3403..092937ac115d 100644 --- a/clang/test/CodeGen/palignr.c +++ b/clang/test/CodeGen/palignr.c @@ -14,18 +14,3 @@ int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); } int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); } // CHECK: xor int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); } - -#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n))) -typedef __attribute__((vector_size(8))) int int2; - -// CHECK: palignr -int2 align5(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 8); } - -// CHECK: palignr -int2 align6(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 9); } - -// CHECK: palignr -int2 align7(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 16); } - -// CHECK: palignr -int2 align8(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 7); } diff --git a/clang/test/CodeGen/pr26099.c b/clang/test/CodeGen/pr26099.c deleted file mode 100644 index 15b73b832e9d..000000000000 --- a/clang/test/CodeGen/pr26099.c +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: %clang_cc1 -ffreestanding %s -triple=i686-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror -// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror -// REQUIRES: asserts - -#include - -int __attribute__ ((__vector_size__ (8))) b; - -void bar(int a) -{ - b = __builtin_ia32_vec_init_v2si (0, a); -} \ No newline at end of file diff --git a/clang/test/Headers/xmmintrin.c b/clang/test/Headers/xmmintrin.c index a75b3380368c..15e4a431df65 100644 --- a/clang/test/Headers/xmmintrin.c +++ b/clang/test/Headers/xmmintrin.c @@ -14,7 +14,7 @@ _MM_ALIGN16 char c; // checking that clang emits PACKSSDW instead of PACKSSWB. 
// CHECK: define{{.*}} i64 @test_mm_cvtps_pi16 -// CHECK: call x86_mmx @llvm.x86.mmx.packssdw +// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128 __m64 test_mm_cvtps_pi16(__m128 a) { return _mm_cvtps_pi16(a); diff --git a/clang/test/Sema/x86-builtin-palignr.c b/clang/test/Sema/x86-builtin-palignr.c index e055cbb70e9e..33a963c15b00 100644 --- a/clang/test/Sema/x86-builtin-palignr.c +++ b/clang/test/Sema/x86-builtin-palignr.c @@ -4,5 +4,5 @@ #include __m64 test1(__m64 a, __m64 b, int c) { - return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}} + return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}} } diff --git a/clang/www/builtins.py b/clang/www/builtins.py index 0c2e181a8cfa..849e6bd4a816 100755 --- a/clang/www/builtins.py +++ b/clang/www/builtins.py @@ -4,8 +4,9 @@ import sys, fileinput err = 0 -# Giant associative set of builtin->intrinsic mappings where clang doesn't -# implement the builtin since the vector operation works by default. +# Giant associative set of builtin->intrinsic mappings where clang +# doesn't implement the builtin. (Either because the vector operation +# works without a builtin, or for other reasons.) repl_map = { "__builtin_ia32_addps": "_mm_add_ps", @@ -134,6 +135,99 @@ repl_map = { "__builtin_ia32_vec_ext_v2di": "_mm_extract_epi64", "__builtin_ia32_vec_ext_v4hi": "_mm_extract_pi16", "__builtin_ia32_vec_ext_v4sf": "_mm_extract_ps", + # Removed MMX builtins + "__builtin_ia32_paddb": "_mm_add_pi8", + "__builtin_ia32_paddw": "_mm_add_pi16", + "__builtin_ia32_paddd": "_mm_add_pi32", + "__builtin_ia32_paddsb": "_mm_adds_pi8", + "__builtin_ia32_paddsw": "_mm_adds_pi16", + "__builtin_ia32_paddusb": "_mm_adds_pu8", + "__builtin_ia32_paddusw": "_mm_adds_pu16", + "__builtin_ia32_psubb": "_mm_sub_pi8", + "__builtin_ia32_psubw": "_mm_sub_pi16", + "__builtin_ia32_psubd": "_mm_sub_pi32", + "__builtin_ia32_psubsb": "_mm_subs_pi8", + "__builtin_ia32_psubsw": "_mm_subs_pi16", + "__builtin_ia32_psubusb": "_mm_subs_pu8", + "__builtin_ia32_psubusw": "_mm_subs_pu16", + "__builtin_ia32_pmulhw": "_mm_mulhi_pi16", + "__builtin_ia32_pmullw": "_mm_mullo_pi16", + "__builtin_ia32_pmaddwd": "_mm_madd_pi16", + "__builtin_ia32_pand": "_mm_and_si64", + "__builtin_ia32_pandn": "_mm_andnot_si64", + "__builtin_ia32_por": "_mm_or_si64", + "__builtin_ia32_pxor": "_mm_xor_si64", + "__builtin_ia32_psllw": "_mm_sll_pi16", + "__builtin_ia32_pslld": "_mm_sll_pi32", + "__builtin_ia32_psllq": "_mm_sll_si64", + "__builtin_ia32_psrlw": "_mm_srl_pi16", + "__builtin_ia32_psrld": "_mm_srl_pi32", + "__builtin_ia32_psrlq": "_mm_srl_si64", + "__builtin_ia32_psraw": "_mm_sra_pi16", + "__builtin_ia32_psrad": "_mm_sra_pi32", + "__builtin_ia32_psllwi": "_mm_slli_pi16", + "__builtin_ia32_pslldi": "_mm_slli_pi32", + "__builtin_ia32_psllqi": "_mm_slli_si64", + "__builtin_ia32_psrlwi": "_mm_srli_pi16", + "__builtin_ia32_psrldi": "_mm_srli_pi32", + "__builtin_ia32_psrlqi": "_mm_srli_si64", + "__builtin_ia32_psrawi": "_mm_srai_pi16", + "__builtin_ia32_psradi": "_mm_srai_pi32", + "__builtin_ia32_packsswb": "_mm_packs_pi16", + "__builtin_ia32_packssdw": "_mm_packs_pi32", + "__builtin_ia32_packuswb": "_mm_packs_pu16", + "__builtin_ia32_punpckhbw": "_mm_unpackhi_pi8", + "__builtin_ia32_punpckhwd": "_mm_unpackhi_pi16", + "__builtin_ia32_punpckhdq": "_mm_unpackhi_pi32", + "__builtin_ia32_punpcklbw": "_mm_unpacklo_pi8", + "__builtin_ia32_punpcklwd": "_mm_unpacklo_pi16", + 
"__builtin_ia32_punpckldq": "_mm_unpacklo_pi32", + "__builtin_ia32_pcmpeqb": "_mm_cmpeq_pi8", + "__builtin_ia32_pcmpeqw": "_mm_cmpeq_pi16", + "__builtin_ia32_pcmpeqd": "_mm_cmpeq_pi32", + "__builtin_ia32_pcmpgtb": "_mm_cmpgt_pi8", + "__builtin_ia32_pcmpgtw": "_mm_cmpgt_pi16", + "__builtin_ia32_pcmpgtd": "_mm_cmpgt_pi32", + "__builtin_ia32_maskmovq": "_mm_maskmove_si64", + "__builtin_ia32_movntq": "_mm_stream_pi", + "__builtin_ia32_vec_init_v2si": "_mm_setr_pi32", + "__builtin_ia32_vec_init_v4hi": "_mm_setr_pi16", + "__builtin_ia32_vec_init_v8qi": "_mm_setr_pi8", + "__builtin_ia32_cvtpi2ps": "_mm_cvtpi32_ps", + "__builtin_ia32_cvtps2pi": "_mm_cvtps_pi32", + "__builtin_ia32_cvttps2pi": "_mm_cvttps_pi32", + "__builtin_ia32_pavgb": "_mm_avg_pu8", + "__builtin_ia32_pavgw": "_mm_avg_pu16", + "__builtin_ia32_pmaxsw": "_mm_max_pi16", + "__builtin_ia32_pmaxub": "_mm_max_pu8", + "__builtin_ia32_pminsw": "_mm_min_pi16", + "__builtin_ia32_pminub": "_mm_min_pu8", + "__builtin_ia32_pmovmskb": "_mm_movemask_pi8", + "__builtin_ia32_pmulhuw": "_mm_mulhi_pu16", + "__builtin_ia32_psadbw": "_mm_sad_pu8", + "__builtin_ia32_pshufw": "_mm_shuffle_pi16", + "__builtin_ia32_cvtpd2pi": "_mm_cvtpd_pi32", + "__builtin_ia32_cvtpi2pd": "_mm_cvtpi32_pd", + "__builtin_ia32_cvttpd2pi": "_mm_cvttpd_pi32", + "__builtin_ia32_paddq": "_mm_add_si64", + "__builtin_ia32_pmuludq": "_mm_mul_su32", + "__builtin_ia32_psubq": "_mm_sub_si64", + "__builtin_ia32_pabsb": "_mm_abs_pi8", + "__builtin_ia32_pabsd": "_mm_abs_pi32", + "__builtin_ia32_pabsw": "_mm_abs_pi16", + "__builtin_ia32_palignr": "_mm_alignr_pi8", + "__builtin_ia32_phaddd": "_mm_hadd_pi32", + "__builtin_ia32_phaddsw": "_mm_hadds_pi16", + "__builtin_ia32_phaddw": "_mm_hadd_pi16", + "__builtin_ia32_phsubd": "_mm_hsub_pi32", + "__builtin_ia32_phsubsw": "_mm_hsubs_pi16", + "__builtin_ia32_phsubw": "_mm_hsub_pi16", + "__builtin_ia32_pmaddubsw": "_mm_maddubs_pi16", + "__builtin_ia32_pmulhrsw": "_mm_mulhrs_pi16", + "__builtin_ia32_pshufb": "_mm_shuffle_pi8", + "__builtin_ia32_psignw": "_mm_sign_pi16", + "__builtin_ia32_psignb": "_mm_sign_pi8", + "__builtin_ia32_psignd": "_mm_sign_pi32", } # Special unhandled cases: diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index adc46f9789eb..b6a92136f382 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -222,11 +222,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_cvttss2si64 : ClangBuiltin<"__builtin_ia32_cvttss2si64">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_sse_cvtps2pi : ClangBuiltin<"__builtin_ia32_cvtps2pi">, + def int_x86_sse_cvtps2pi : DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_sse_cvttps2pi: ClangBuiltin<"__builtin_ia32_cvttps2pi">, + def int_x86_sse_cvttps2pi: DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_sse_cvtpi2ps : ClangBuiltin<"__builtin_ia32_cvtpi2ps">, + def int_x86_sse_cvtpi2ps : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_x86mmx_ty], [IntrNoMem]>; } @@ -426,11 +426,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
   def int_x86_sse2_cvtsd2ss : ClangBuiltin<"__builtin_ia32_cvtsd2ss">,
               DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpd2pi : ClangBuiltin<"__builtin_ia32_cvtpd2pi">,
+  def int_x86_sse_cvtpd2pi :
               DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttpd2pi: ClangBuiltin<"__builtin_ia32_cvttpd2pi">,
+  def int_x86_sse_cvttpd2pi:
               DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2pd : ClangBuiltin<"__builtin_ia32_cvtpi2pd">,
+  def int_x86_sse_cvtpi2pd :
               DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
@@ -512,49 +512,49 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_phadd_w : ClangBuiltin<"__builtin_ia32_phaddw">,
+  def int_x86_ssse3_phadd_w :
               DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_w_128 : ClangBuiltin<"__builtin_ia32_phaddw128">,
               DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_d : ClangBuiltin<"__builtin_ia32_phaddd">,
+  def int_x86_ssse3_phadd_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_d_128 : ClangBuiltin<"__builtin_ia32_phaddd128">,
              DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phadd_sw : ClangBuiltin<"__builtin_ia32_phaddsw">,
+  def int_x86_ssse3_phadd_sw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_sw_128 : ClangBuiltin<"__builtin_ia32_phaddsw128">,
              DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_w : ClangBuiltin<"__builtin_ia32_phsubw">,
+  def int_x86_ssse3_phsub_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_w_128 : ClangBuiltin<"__builtin_ia32_phsubw128">,
              DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_d : ClangBuiltin<"__builtin_ia32_phsubd">,
+  def int_x86_ssse3_phsub_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_d_128 : ClangBuiltin<"__builtin_ia32_phsubd128">,
              DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
-  def int_x86_ssse3_phsub_sw : ClangBuiltin<"__builtin_ia32_phsubsw">,
+  def int_x86_ssse3_phsub_sw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_sw_128 : ClangBuiltin<"__builtin_ia32_phsubsw128">,
              DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pmadd_ub_sw : ClangBuiltin<"__builtin_ia32_pmaddubsw">,
+  def int_x86_ssse3_pmadd_ub_sw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pmadd_ub_sw_128 : ClangBuiltin<"__builtin_ia32_pmaddubsw128">,
@@ -564,7 +564,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Packed multiply high with round and scale
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pmul_hr_sw : ClangBuiltin<"__builtin_ia32_pmulhrsw">,
+  def int_x86_ssse3_pmul_hr_sw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   def int_x86_ssse3_pmul_hr_sw_128 : ClangBuiltin<"__builtin_ia32_pmulhrsw128">,
@@ -574,34 +574,34 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Shuffle ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pshuf_b : ClangBuiltin<"__builtin_ia32_pshufb">,
+  def int_x86_ssse3_pshuf_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pshuf_b_128 : ClangBuiltin<"__builtin_ia32_pshufb128">,
              DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse_pshuf_w : ClangBuiltin<"__builtin_ia32_pshufw">,
+  def int_x86_sse_pshuf_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 // Sign ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_psign_b : ClangBuiltin<"__builtin_ia32_psignb">,
+  def int_x86_ssse3_psign_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_b_128 : ClangBuiltin<"__builtin_ia32_psignb128">,
              DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_w : ClangBuiltin<"__builtin_ia32_psignw">,
+  def int_x86_ssse3_psign_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_w_128 : ClangBuiltin<"__builtin_ia32_psignw128">,
              DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
-  def int_x86_ssse3_psign_d : ClangBuiltin<"__builtin_ia32_psignd">,
+  def int_x86_ssse3_psign_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_d_128 : ClangBuiltin<"__builtin_ia32_psignd128">,
@@ -611,13 +611,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Absolute value ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pabs_b : ClangBuiltin<"__builtin_ia32_pabsb">,
+  def int_x86_ssse3_pabs_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pabs_w : ClangBuiltin<"__builtin_ia32_pabsw">,
+  def int_x86_ssse3_pabs_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_ssse3_pabs_d : ClangBuiltin<"__builtin_ia32_pabsd">,
+  def int_x86_ssse3_pabs_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
@@ -2260,118 +2260,118 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   // Addition
-  def int_x86_mmx_padd_b : ClangBuiltin<"__builtin_ia32_paddb">,
+  def int_x86_mmx_padd_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_w : ClangBuiltin<"__builtin_ia32_paddw">,
+  def int_x86_mmx_padd_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_d : ClangBuiltin<"__builtin_ia32_paddd">,
+  def int_x86_mmx_padd_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_q : ClangBuiltin<"__builtin_ia32_paddq">,
+  def int_x86_mmx_padd_q :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_b : ClangBuiltin<"__builtin_ia32_paddsb">,
+  def int_x86_mmx_padds_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_w : ClangBuiltin<"__builtin_ia32_paddsw">,
+  def int_x86_mmx_padds_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_b : ClangBuiltin<"__builtin_ia32_paddusb">,
+  def int_x86_mmx_paddus_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_w : ClangBuiltin<"__builtin_ia32_paddusw">,
+  def int_x86_mmx_paddus_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   // Subtraction
-  def int_x86_mmx_psub_b : ClangBuiltin<"__builtin_ia32_psubb">,
+  def int_x86_mmx_psub_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psub_w : ClangBuiltin<"__builtin_ia32_psubw">,
+  def int_x86_mmx_psub_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psub_d : ClangBuiltin<"__builtin_ia32_psubd">,
+  def int_x86_mmx_psub_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psub_q : ClangBuiltin<"__builtin_ia32_psubq">,
+  def int_x86_mmx_psub_q :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubs_b : ClangBuiltin<"__builtin_ia32_psubsb">,
+  def int_x86_mmx_psubs_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubs_w : ClangBuiltin<"__builtin_ia32_psubsw">,
+  def int_x86_mmx_psubs_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubus_b : ClangBuiltin<"__builtin_ia32_psubusb">,
+  def int_x86_mmx_psubus_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psubus_w : ClangBuiltin<"__builtin_ia32_psubusw">,
+  def int_x86_mmx_psubus_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   // Multiplication
-  def int_x86_mmx_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw">,
+  def int_x86_mmx_pmulh_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmull_w : ClangBuiltin<"__builtin_ia32_pmullw">,
+  def int_x86_mmx_pmull_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw">,
+  def int_x86_mmx_pmulhu_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulu_dq : ClangBuiltin<"__builtin_ia32_pmuludq">,
+  def int_x86_mmx_pmulu_dq :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd">,
+  def int_x86_mmx_pmadd_wd :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   // Bitwise operations
-  def int_x86_mmx_pand : ClangBuiltin<"__builtin_ia32_pand">,
+  def int_x86_mmx_pand :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pandn : ClangBuiltin<"__builtin_ia32_pandn">,
+  def int_x86_mmx_pandn :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_por : ClangBuiltin<"__builtin_ia32_por">,
+  def int_x86_mmx_por :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pxor : ClangBuiltin<"__builtin_ia32_pxor">,
+  def int_x86_mmx_pxor :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   // Averages
-  def int_x86_mmx_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb">,
+  def int_x86_mmx_pavg_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw">,
+  def int_x86_mmx_pavg_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   // Maximum
-  def int_x86_mmx_pmaxu_b : ClangBuiltin<"__builtin_ia32_pmaxub">,
+  def int_x86_mmx_pmaxu_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmaxs_w : ClangBuiltin<"__builtin_ia32_pmaxsw">,
+  def int_x86_mmx_pmaxs_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   // Minimum
-  def int_x86_mmx_pminu_b : ClangBuiltin<"__builtin_ia32_pminub">,
+  def int_x86_mmx_pminu_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmins_w : ClangBuiltin<"__builtin_ia32_pminsw">,
+  def int_x86_mmx_pmins_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   // Packed sum of absolute differences
-  def int_x86_mmx_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw">,
+  def int_x86_mmx_psad_bw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
 }
@@ -2379,58 +2379,58 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Integer shift ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   // Shift left logical
-  def int_x86_mmx_psll_w : ClangBuiltin<"__builtin_ia32_psllw">,
+  def int_x86_mmx_psll_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psll_d : ClangBuiltin<"__builtin_ia32_pslld">,
+  def int_x86_mmx_psll_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psll_q : ClangBuiltin<"__builtin_ia32_psllq">,
+  def int_x86_mmx_psll_q :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw">,
+  def int_x86_mmx_psrl_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrl_d : ClangBuiltin<"__builtin_ia32_psrld">,
+  def int_x86_mmx_psrl_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq">,
+  def int_x86_mmx_psrl_q :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psra_w : ClangBuiltin<"__builtin_ia32_psraw">,
+  def int_x86_mmx_psra_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_psra_d : ClangBuiltin<"__builtin_ia32_psrad">,
+  def int_x86_mmx_psra_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
   // Oddly these don't require an immediate due to a gcc compatibility issue.
-  def int_x86_mmx_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi">,
+  def int_x86_mmx_pslli_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi">,
+  def int_x86_mmx_pslli_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi">,
+  def int_x86_mmx_pslli_q :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi">,
+  def int_x86_mmx_psrli_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi">,
+  def int_x86_mmx_psrli_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi">,
+  def int_x86_mmx_psrli_q :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi">,
+  def int_x86_mmx_psrai_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_mmx_psrai_d : ClangBuiltin<"__builtin_ia32_psradi">,
+  def int_x86_mmx_psrai_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>;
 }
@@ -2475,83 +2475,83 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 }
 // Pack ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_packsswb : ClangBuiltin<"__builtin_ia32_packsswb">,
+  def int_x86_mmx_packsswb :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_packssdw : ClangBuiltin<"__builtin_ia32_packssdw">,
+  def int_x86_mmx_packssdw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_packuswb : ClangBuiltin<"__builtin_ia32_packuswb">,
+  def int_x86_mmx_packuswb :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }
 // Unpacking ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_punpckhbw : ClangBuiltin<"__builtin_ia32_punpckhbw">,
+  def int_x86_mmx_punpckhbw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpckhwd : ClangBuiltin<"__builtin_ia32_punpckhwd">,
+  def int_x86_mmx_punpckhwd :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpckhdq : ClangBuiltin<"__builtin_ia32_punpckhdq">,
+  def int_x86_mmx_punpckhdq :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpcklbw : ClangBuiltin<"__builtin_ia32_punpcklbw">,
+  def int_x86_mmx_punpcklbw :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpcklwd : ClangBuiltin<"__builtin_ia32_punpcklwd">,
+  def int_x86_mmx_punpcklwd :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_punpckldq : ClangBuiltin<"__builtin_ia32_punpckldq">,
+  def int_x86_mmx_punpckldq :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }
 // Integer comparison ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_pcmpeq_b : ClangBuiltin<"__builtin_ia32_pcmpeqb">,
+  def int_x86_mmx_pcmpeq_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_w : ClangBuiltin<"__builtin_ia32_pcmpeqw">,
+  def int_x86_mmx_pcmpeq_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_d : ClangBuiltin<"__builtin_ia32_pcmpeqd">,
+  def int_x86_mmx_pcmpeq_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpgt_b : ClangBuiltin<"__builtin_ia32_pcmpgtb">,
+  def int_x86_mmx_pcmpgt_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_w : ClangBuiltin<"__builtin_ia32_pcmpgtw">,
+  def int_x86_mmx_pcmpgt_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_d : ClangBuiltin<"__builtin_ia32_pcmpgtd">,
+  def int_x86_mmx_pcmpgt_d :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
 }
 // Misc.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_maskmovq : ClangBuiltin<"__builtin_ia32_maskmovq">,
+  def int_x86_mmx_maskmovq :
              Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>;
-  def int_x86_mmx_pmovmskb : ClangBuiltin<"__builtin_ia32_pmovmskb">,
+  def int_x86_mmx_pmovmskb :
              DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_mmx_movnt_dq : ClangBuiltin<"__builtin_ia32_movntq">,
+  def int_x86_mmx_movnt_dq :
              Intrinsic<[], [llvm_ptr_ty, llvm_x86mmx_ty], []>;
-  def int_x86_mmx_palignr_b : ClangBuiltin<"__builtin_ia32_palignr">,
+  def int_x86_mmx_palignr_b :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
-  def int_x86_mmx_pextr_w : ClangBuiltin<"__builtin_ia32_vec_ext_v4hi">,
+  def int_x86_mmx_pextr_w :
              DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-  def int_x86_mmx_pinsr_w : ClangBuiltin<"__builtin_ia32_vec_set_v4hi">,
+  def int_x86_mmx_pinsr_w :
              DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;