Mirror of https://github.com/llvm/llvm-project.git (synced 2025-04-18 15:46:45 +00:00)
Clang: convert __m64 intrinsics to unconditionally use SSE2 instead of MMX. (#96540)
The MMX instruction set is legacy, and the SSE2 variants are in every way superior, when they are available -- and they have been available since the Pentium 4 was released, 20 years ago. Therefore, we are switching the "MMX" intrinsics to depend on SSE2, unconditionally.

This change entirely drops the ability to generate vectorized code using compiler intrinsics for chips with MMX but without SSE2: the Intel Pentium MMX, Pentium II, and Pentium III (released 1997-1999), as well as AMD K6 and K7 series chips from roughly the same timeframe. Targeting these older CPUs remains supported -- simply without the ability to use MMX compiler intrinsics.

Migrating away from the use of MMX registers also eliminates a rather non-obvious requirement. The long-standing programming model for these MMX intrinsics requires that the programmer be aware of the x87/MMX mode-switching semantics, and manually call `_mm_empty()` between using any MMX instruction and any x87 FPU instruction. If you neglect to do so, every subsequent x87 operation will return a NaN result. This requirement is not at all obvious to users of these intrinsic functions, and causes very difficult-to-detect bugs. Worse, even if the user does write code that correctly calls `_mm_empty()` in the right places, LLVM may sometimes reorder x87 and MMX operations around each other, unaware of this mode-switching issue. Eliminating the use of MMX registers eliminates this problem.

This change also deletes the now-unnecessary MMX `__builtin_ia32_*` functions from Clang. Only 3 MMX-related builtins remain in use -- `__builtin_ia32_emms`, used by `_mm_empty`, and `__builtin_ia32_vec_{ext,set}_v4hi`, used by `_mm_extract_pi16` and `_mm_insert_pi16`. Note particularly that the latter two lower to generic, non-MMX IR. Support for the LLVM intrinsics underlying these removed builtins still remains, for the moment.

The file `clang/www/builtins.py` has been updated with mappings from the newly-removed `__builtin_ia32` functions to the still-supported equivalents in `mmintrin.h`.

(Originally uploaded at https://reviews.llvm.org/D86855 and https://reviews.llvm.org/D94252)

Fixes issue #41665
Works towards #98272
parent 10ff2bcb5e
commit 0431d6dab4
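For background, a minimal sketch (an illustration, not code from this commit) of the mode-switching hazard the commit message describes; the intrinsics are the standard ones from mmintrin.h:

    #include <mmintrin.h>

    double scaled_low_word(__m64 a, __m64 b) {
      __m64 v = _mm_add_pi16(a, b);   /* formerly compiled to an MMX paddw */
      /* Under the old MMX-based lowering, forgetting _mm_empty() here left
       * the x87 register stack in MMX mode, and the floating-point multiply
       * below could yield NaN. With this change, no MMX state is touched,
       * so the call below is no longer load-bearing (it remains valid). */
      _mm_empty();
      return 1.5 * (double)_mm_cvtsi64_si32(v);
    }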
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -172,6 +172,20 @@ AMDGPU Support
 X86 Support
 ^^^^^^^^^^^
 
+- The MMX vector intrinsic functions from ``*mmintrin.h`` which
+  operate on ``__m64`` vectors, such as ``_mm_add_pi8``, have been
+  reimplemented to use the SSE2 instruction-set and XMM registers
+  unconditionally. These intrinsics are therefore *no longer
+  supported* if MMX is enabled without SSE2 -- either from targeting
+  CPUs from the Pentium-MMX through the Pentium 3, or explicitly via
+  passing arguments such as ``-mmmx -mno-sse2``.
+
+- The compiler builtins such as ``__builtin_ia32_paddb`` which
+  formerly implemented the above MMX intrinsic functions have been
+  removed. Any uses of these removed functions should migrate to the
+  functions defined by the ``*mmintrin.h`` headers. A mapping can be
+  found in the file ``clang/www/builtins.py``.
+
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
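As a sketch of the migration the second release note asks for (assuming a caller that had reached for the removed builtin directly; the mapping comes from clang/www/builtins.py):

    #include <mmintrin.h>

    __m64 add_bytes(__m64 a, __m64 b) {
      /* Before (builtin removed by this commit):
       *   return (__m64)__builtin_ia32_paddb((__v8qi)a, (__v8qi)b);
       * After, the portable intrinsic from mmintrin.h: */
      return _mm_add_pi8(a, b);
    }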
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -47,107 +47,8 @@ TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "")
 // doesn't work in the presence of re-declaration of _mm_prefetch for windows.
 TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
 TARGET_BUILTIN(__builtin_ia32_emms, "v", "n", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddsb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddsw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddusb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddusw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubsb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubsw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubusb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubusw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmulhw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmullw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd, "V2iV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pand, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pandn, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_por, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pxor, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pslld, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllq, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrld, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlq, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psraw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrad, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllwi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pslldi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllqi, "V1OiV1Oii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlwi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrldi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlqi, "V1OiV1Oii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrawi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psradi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packsswb, "V8cV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packssdw, "V4sV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packuswb, "V8cV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhbw, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhwd, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhdq, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpcklbw, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpcklwd, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckldq, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_maskmovq, "vV8cV8cc*", "nV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_movntq, "vV1Oi*V1Oi", "nV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v2si, "V2iii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v4hi, "V4sssss", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v8qi, "V8ccccccccc", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v2si, "iV2ii", "ncV:64:", "mmx")
-
-// MMX2 (MMX+SSE) intrinsics
-TARGET_BUILTIN(__builtin_ia32_cvtpi2ps, "V4fV4fV2i", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pi, "V2iV4f", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pavgb, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pavgw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmaxsw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmaxub, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pminsw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pminub, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse")
-
-// MMX+SSE2
-TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpi2pd, "V2dV2i", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_paddq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_pmuludq, "V1OiV2iV2i", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_psubq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2")
-
-// MMX+SSSE3
-TARGET_BUILTIN(__builtin_ia32_pabsb, "V8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pabsd, "V2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pabsw, "V4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_palignr, "V8cV8cV8cIc", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pshufb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse")
 
 // SSE intrinsics.
 TARGET_BUILTIN(__builtin_ia32_comieq, "iV4fV4f", "ncV:128:", "sse")
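For readers of the table above, a rough decoding of the TARGET_BUILTIN encoding -- this is my reading of the conventions documented in clang's Builtins.def, not text from this commit, so treat the details as assumptions:

    /* TARGET_BUILTIN(name, type-string, attributes, required-features)
     *
     * The type string lists the return type followed by the parameter types:
     *   "V8cV8cV8c" -> vector-of-8-char fn(vector-of-8-char, vector-of-8-char)
     *   "sV4sIi"    -> short fn(vector-of-4-short, int), where 'I' marks an
     *                  argument that must be a compile-time constant
     *   "V1Oi"      -> vector of one 'O' int (long long on non-OpenCL targets)
     *
     * Attributes: 'n' = nothrow, 'c' = const; "V:64:" requires a minimum
     * vector width of 64 bits. The final string names the target features
     * the builtin needs ("mmx", "sse", "mmx,sse2", ...), which is why the
     * two surviving builtins moving from "mmx,sse" to "sse" matters. */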
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14523,12 +14523,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
     // value, we should use that here instead of a zero.
     return llvm::Constant::getNullValue(ConvertType(E->getType()));
-  case X86::BI__builtin_ia32_vec_init_v8qi:
-  case X86::BI__builtin_ia32_vec_init_v4hi:
-  case X86::BI__builtin_ia32_vec_init_v2si:
-    return Builder.CreateBitCast(BuildVector(Ops),
-                                 llvm::Type::getX86_MMXTy(getLLVMContext()));
-  case X86::BI__builtin_ia32_vec_ext_v2si:
+  case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v16qi:
   case X86::BI__builtin_ia32_vec_ext_v8hi:
   case X86::BI__builtin_ia32_vec_ext_v4si:
@@ -14546,6 +14541,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // Otherwise we could just do this in the header file.
     return Builder.CreateExtractElement(Ops[0], Index);
   }
+  case X86::BI__builtin_ia32_vec_set_v4hi:
   case X86::BI__builtin_ia32_vec_set_v16qi:
   case X86::BI__builtin_ia32_vec_set_v8hi:
   case X86::BI__builtin_ia32_vec_set_v4si:
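With `__builtin_ia32_vec_ext_v4hi` routed through the generic extract-element path above, no MMX-specific IR is emitted. A conceptual C equivalent (an illustration with a hypothetical helper name, not the literal codegen):

    typedef short __v4hi __attribute__((__vector_size__(8)));

    /* Roughly what Builder.CreateExtractElement amounts to here:
     * a plain element read from a generic 4 x i16 vector. */
    static short ext_v4hi(__v4hi __v, int __i) {
      return __v[__i];
    }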
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -52,9 +52,12 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, \
                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
+
+#define __trunc64(x) \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x) \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+                                    1, -1, -1)
 
 /// Adds lower double-precision values in both operands and returns the
 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
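These two helper macros are the heart of the new lowering: `__anyext128` widens a 64-bit vector into the low half of a 128-bit vector (the -1 shuffle indices leave the high lanes unspecified), and `__trunc64` keeps only the low 64 bits of a 128-bit result. A sketch of the same round-trip using public intrinsics, which zero the high lanes rather than leaving them unspecified:

    #include <emmintrin.h>

    __m64 mulhi_via_sse2(__m64 a, __m64 b) {
      __m128i wa = _mm_movpi64_epi64(a);     /* a in the low 64 bits, high zeroed */
      __m128i wb = _mm_movpi64_epi64(b);
      __m128i wr = _mm_mulhi_epi16(wa, wb);  /* 8 x i16 multiply-high */
      return _mm_movepi64_pi64(wr);          /* truncate back to 64 bits */
    }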
@@ -1486,8 +1489,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
 }
 
 /// Converts the two double-precision floating-point elements of a
@@ -1505,8 +1508,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
 }
 
 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
@@ -1520,8 +1523,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
 /// \param __a
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
-  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
+  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
 }
 
 /// Returns the low-order element of a 128-bit vector of [2 x double] as
@@ -2108,9 +2111,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
 /// \param __b
 ///    A 64-bit integer.
 /// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
+  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
 }
 
 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
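The casts to unsigned long long are what make this legal plain C: unsigned addition wraps modulo 2^64, matching the old paddq, whereas signed overflow would be undefined behavior. A small self-check under that assumption:

    #include <mmintrin.h>
    #include <assert.h>

    int main(void) {
      __m64 a = _mm_cvtsi64_m64(-1);  /* all-ones 64-bit pattern */
      __m64 b = _mm_cvtsi64_m64(2);
      /* 2 + (-1) wraps to 1, exactly as the hardware paddq would. */
      assert(_mm_cvtm64_si64(_mm_add_si64(a, b)) == 1);
      return 0;
    }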
@@ -2431,9 +2433,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
 /// \param __b
 ///    A 64-bit integer containing one of the source operands.
 /// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
-                                                            __m64 __b) {
-  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
+                                             (__v4si)__anyext128(__b)));
 }
 
 /// Multiplies 32-bit unsigned integer values contained in the lower
@@ -2539,9 +2541,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
 ///    A 64-bit integer vector containing the subtrahend.
 /// \returns A 64-bit integer vector containing the difference of the values in
 ///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
+  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
 }
 
 /// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -4889,8 +4890,10 @@ void _mm_pause(void);
 #if defined(__cplusplus)
 } // extern "C"
 #endif
+
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
 
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -21,10 +21,33 @@ typedef int __v2si __attribute__((__vector_size__(8)));
 typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
+/* Unsigned types */
+typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
+typedef unsigned char __v8qu __attribute__((__vector_size__(8)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v8qs __attribute__((__vector_size__(8)));
+
+/* SSE/SSE2 types */
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
-                 __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2 \
+  __attribute__((__always_inline__, __nodebug__, \
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x) \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x) \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+                                    1, -1, -1)
+
 /// Clears the MMX state by setting the state of the x87 stack registers
 /// to empty.
@@ -50,10 +73,10 @@ _mm_empty(void)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
 ///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi32_si64(int __i)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+    return __extension__ (__m64)(__v2si){__i, 0};
 }
 
 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
@@ -67,10 +90,10 @@ _mm_cvtsi32_si64(int __i)
 ///    A 64-bit integer vector.
 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
 ///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_si32(__m64 __m)
 {
-    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+    return ((__v2si)__m)[0];
 }
 
 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
@@ -83,7 +106,7 @@ _mm_cvtsi64_si32(__m64 __m)
 ///    A 64-bit signed integer.
 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
@@ -99,7 +122,7 @@ _mm_cvtsi64_m64(long long __i)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
|
|||||||
/// written to the upper 32 bits of the result.
|
/// written to the upper 32 bits of the result.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_packs_pi16(__m64 __m1, __m64 __m2)
|
_mm_packs_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
|
return __trunc64(__builtin_ia32_packsswb128(
|
||||||
|
(__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
|
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
|
||||||
@ -149,10 +173,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// written to the upper 32 bits of the result.
|
/// written to the upper 32 bits of the result.
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
|
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_packs_pi32(__m64 __m1, __m64 __m2)
|
_mm_packs_pi32(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
|
return __trunc64(__builtin_ia32_packssdw128(
|
||||||
|
(__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
|
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
|
||||||
@ -174,10 +199,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
|
|||||||
/// written to the upper 32 bits of the result.
|
/// written to the upper 32 bits of the result.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_packs_pu16(__m64 __m1, __m64 __m2)
|
_mm_packs_pu16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
|
return __trunc64(__builtin_ia32_packuswb128(
|
||||||
|
(__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
|
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
|
||||||
@ -201,10 +227,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
|
|||||||
/// Bits [63:56] are written to bits [63:56] of the result.
|
/// Bits [63:56] are written to bits [63:56] of the result.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
|
||||||
|
4, 12, 5, 13, 6, 14, 7, 15);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
||||||
@ -224,10 +251,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
|||||||
/// Bits [63:48] are written to bits [63:48] of the result.
|
/// Bits [63:48] are written to bits [63:48] of the result.
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
|
||||||
|
2, 6, 3, 7);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
||||||
@ -245,10 +273,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// the upper 32 bits of the result.
|
/// the upper 32 bits of the result.
|
||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
|
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
|
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
|
||||||
@ -272,10 +300,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
|||||||
/// Bits [31:24] are written to bits [63:56] of the result.
|
/// Bits [31:24] are written to bits [63:56] of the result.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
|
||||||
|
0, 8, 1, 9, 2, 10, 3, 11);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
||||||
@ -295,10 +324,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
|||||||
/// Bits [31:16] are written to bits [63:48] of the result.
|
/// Bits [31:16] are written to bits [63:48] of the result.
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
|
||||||
|
0, 4, 1, 5);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
||||||
@ -316,10 +346,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// the upper 32 bits of the result.
|
/// the upper 32 bits of the result.
|
||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
||||||
/// values.
|
/// values.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
|
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds each 8-bit integer element of the first 64-bit integer vector
|
/// Adds each 8-bit integer element of the first 64-bit integer vector
|
||||||
@ -337,10 +367,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [8 x i8].
|
/// A 64-bit integer vector of [8 x i8].
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
|
/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
|
||||||
/// parameters.
|
/// parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_add_pi8(__m64 __m1, __m64 __m2)
|
_mm_add_pi8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds each 16-bit integer element of the first 64-bit integer vector
|
/// Adds each 16-bit integer element of the first 64-bit integer vector
|
||||||
@ -358,10 +388,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16].
|
/// A 64-bit integer vector of [4 x i16].
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
|
/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
|
||||||
/// parameters.
|
/// parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_add_pi16(__m64 __m1, __m64 __m2)
|
_mm_add_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds each 32-bit integer element of the first 64-bit integer vector
|
/// Adds each 32-bit integer element of the first 64-bit integer vector
|
||||||
@ -379,10 +409,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [2 x i32].
|
/// A 64-bit integer vector of [2 x i32].
|
||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
|
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
|
||||||
/// parameters.
|
/// parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_add_pi32(__m64 __m1, __m64 __m2)
|
_mm_add_pi32(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
|
return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds, with saturation, each 8-bit signed integer element of the first
|
/// Adds, with saturation, each 8-bit signed integer element of the first
|
||||||
@ -403,10 +433,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [8 x i8].
|
/// A 64-bit integer vector of [8 x i8].
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
|
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
|
||||||
/// of both parameters.
|
/// of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_adds_pi8(__m64 __m1, __m64 __m2)
|
_mm_adds_pi8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds, with saturation, each 16-bit signed integer element of the first
|
/// Adds, with saturation, each 16-bit signed integer element of the first
|
||||||
@ -427,10 +457,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16].
|
/// A 64-bit integer vector of [4 x i16].
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
|
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
|
||||||
/// of both parameters.
|
/// of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_adds_pi16(__m64 __m1, __m64 __m2)
|
_mm_adds_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds, with saturation, each 8-bit unsigned integer element of the first
|
/// Adds, with saturation, each 8-bit unsigned integer element of the first
|
||||||
@ -450,10 +480,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [8 x i8].
|
/// A 64-bit integer vector of [8 x i8].
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
|
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
|
||||||
/// unsigned sums of both parameters.
|
/// unsigned sums of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_adds_pu8(__m64 __m1, __m64 __m2)
|
_mm_adds_pu8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds, with saturation, each 16-bit unsigned integer element of the first
|
/// Adds, with saturation, each 16-bit unsigned integer element of the first
|
||||||
@ -473,10 +503,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16].
|
/// A 64-bit integer vector of [4 x i16].
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
|
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
|
||||||
/// unsigned sums of both parameters.
|
/// unsigned sums of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_adds_pu16(__m64 __m1, __m64 __m2)
|
_mm_adds_pu16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts each 8-bit integer element of the second 64-bit integer
|
/// Subtracts each 8-bit integer element of the second 64-bit integer
|
||||||
@ -494,10 +524,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
|
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
|
/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
|
||||||
/// both parameters.
|
/// both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_sub_pi8(__m64 __m1, __m64 __m2)
|
_mm_sub_pi8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts each 16-bit integer element of the second 64-bit integer
|
/// Subtracts each 16-bit integer element of the second 64-bit integer
|
||||||
@ -515,10 +545,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
|
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
|
/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
|
||||||
/// both parameters.
|
/// both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_sub_pi16(__m64 __m1, __m64 __m2)
|
_mm_sub_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts each 32-bit integer element of the second 64-bit integer
|
/// Subtracts each 32-bit integer element of the second 64-bit integer
|
||||||
@ -536,10 +566,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
|
/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
|
/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
|
||||||
/// both parameters.
|
/// both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_sub_pi32(__m64 __m1, __m64 __m2)
|
_mm_sub_pi32(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
|
return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts, with saturation, each 8-bit signed integer element of the second
|
/// Subtracts, with saturation, each 8-bit signed integer element of the second
|
||||||
@ -560,10 +590,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
|
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
|
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
|
||||||
/// differences of both parameters.
|
/// differences of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_subs_pi8(__m64 __m1, __m64 __m2)
|
_mm_subs_pi8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts, with saturation, each 16-bit signed integer element of the
|
/// Subtracts, with saturation, each 16-bit signed integer element of the
|
||||||
@ -584,10 +614,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
|
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
|
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
|
||||||
/// differences of both parameters.
|
/// differences of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_subs_pi16(__m64 __m1, __m64 __m2)
|
_mm_subs_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts each 8-bit unsigned integer element of the second 64-bit
|
/// Subtracts each 8-bit unsigned integer element of the second 64-bit
|
||||||
@ -608,10 +638,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
|
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
|
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
|
||||||
/// differences of both parameters.
|
/// differences of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_subs_pu8(__m64 __m1, __m64 __m2)
|
_mm_subs_pu8(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
|
return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Subtracts each 16-bit unsigned integer element of the second 64-bit
|
/// Subtracts each 16-bit unsigned integer element of the second 64-bit
|
||||||
@ -632,10 +662,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
|
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
|
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
|
||||||
/// differences of both parameters.
|
/// differences of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_subs_pu16(__m64 __m1, __m64 __m2)
|
_mm_subs_pu16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
||||||
@ -659,10 +689,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16].
|
/// A 64-bit integer vector of [4 x i16].
|
||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
|
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
|
||||||
/// products of both parameters.
|
/// products of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_madd_pi16(__m64 __m1, __m64 __m2)
|
_mm_madd_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
|
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
|
||||||
|
(__v8hi)__anyext128(__m2)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
||||||
@ -680,10 +711,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16].
|
/// A 64-bit integer vector of [4 x i16].
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
|
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
|
||||||
/// of the products of both parameters.
|
/// of the products of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
|
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
|
return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
|
||||||
|
(__v8hi)__anyext128(__m2)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
/// Multiplies each 16-bit signed integer element of the first 64-bit
|
||||||
@ -701,10 +733,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// A 64-bit integer vector of [4 x i16].
|
/// A 64-bit integer vector of [4 x i16].
|
||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
|
/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
|
||||||
/// of the products of both parameters.
|
/// of the products of both parameters.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
|
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
|
return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Left-shifts each 16-bit signed integer element of the first
|
/// Left-shifts each 16-bit signed integer element of the first
|
||||||
@ -724,10 +756,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
|
|||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
|
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
|
||||||
/// values. If \a __count is greater or equal to 16, the result is set to all
|
/// values. If \a __count is greater or equal to 16, the result is set to all
|
||||||
/// 0.
|
/// 0.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_sll_pi16(__m64 __m, __m64 __count)
|
_mm_sll_pi16(__m64 __m, __m64 __count)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
|
return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
|
||||||
|
(__v8hi)__anyext128(__count)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Left-shifts each 16-bit signed integer element of a 64-bit integer
|
/// Left-shifts each 16-bit signed integer element of a 64-bit integer
|
||||||
@ -746,10 +779,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
|
|||||||
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
|
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
|
||||||
/// values. If \a __count is greater or equal to 16, the result is set to all
|
/// values. If \a __count is greater or equal to 16, the result is set to all
|
||||||
/// 0.
|
/// 0.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_slli_pi16(__m64 __m, int __count)
|
_mm_slli_pi16(__m64 __m, int __count)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
|
return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
|
||||||
|
__count));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Left-shifts each 32-bit signed integer element of the first
|
/// Left-shifts each 32-bit signed integer element of the first
|
||||||
@ -769,10 +803,11 @@ _mm_slli_pi16(__m64 __m, int __count)
|
|||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
|
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
|
||||||
/// values. If \a __count is greater or equal to 32, the result is set to all
|
/// values. If \a __count is greater or equal to 32, the result is set to all
|
||||||
/// 0.
|
/// 0.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_sll_pi32(__m64 __m, __m64 __count)
|
_mm_sll_pi32(__m64 __m, __m64 __count)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
|
return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
|
||||||
|
(__v4si)__anyext128(__count)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Left-shifts each 32-bit signed integer element of a 64-bit integer
|
/// Left-shifts each 32-bit signed integer element of a 64-bit integer
|
||||||
@ -791,10 +826,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
|
|||||||
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
|
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
|
||||||
/// values. If \a __count is greater or equal to 32, the result is set to all
|
/// values. If \a __count is greater or equal to 32, the result is set to all
|
||||||
/// 0.
|
/// 0.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_slli_pi32(__m64 __m, int __count)
|
_mm_slli_pi32(__m64 __m, int __count)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
|
return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
|
||||||
|
__count));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Left-shifts the first 64-bit integer parameter by the number of bits
|
/// Left-shifts the first 64-bit integer parameter by the number of bits
|
||||||
@ -811,10 +847,11 @@ _mm_slli_pi32(__m64 __m, int __count)
|
|||||||
/// A 64-bit integer vector interpreted as a single 64-bit integer.
|
/// A 64-bit integer vector interpreted as a single 64-bit integer.
|
||||||
/// \returns A 64-bit integer vector containing the left-shifted value. If
|
/// \returns A 64-bit integer vector containing the left-shifted value. If
|
||||||
/// \a __count is greater or equal to 64, the result is set to 0.
|
/// \a __count is greater or equal to 64, the result is set to 0.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_sll_si64(__m64 __m, __m64 __count)
|
_mm_sll_si64(__m64 __m, __m64 __count)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
|
return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
|
||||||
|
(__v2di)__anyext128(__count)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Left-shifts the first parameter, which is a 64-bit integer, by the
|
/// Left-shifts the first parameter, which is a 64-bit integer, by the
|
||||||
@ -831,10 +868,11 @@ _mm_sll_si64(__m64 __m, __m64 __count)
|
|||||||
/// A 32-bit integer value.
|
/// A 32-bit integer value.
|
||||||
/// \returns A 64-bit integer vector containing the left-shifted value. If
|
/// \returns A 64-bit integer vector containing the left-shifted value. If
|
||||||
/// \a __count is greater or equal to 64, the result is set to 0.
|
/// \a __count is greater or equal to 64, the result is set to 0.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_slli_si64(__m64 __m, int __count)
|
_mm_slli_si64(__m64 __m, int __count)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
|
return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
|
||||||
|
__count));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Right-shifts each 16-bit integer element of the first parameter,
|
/// Right-shifts each 16-bit integer element of the first parameter,
|
||||||
@@ -855,10 +893,11 @@ _mm_slli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
-  return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+  return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
+                                           (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -878,10 +917,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi16(__m64 __m, int __count)
 {
-  return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+  return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
+                                            __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -902,10 +942,11 @@ _mm_srai_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
-  return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+  return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
+                                           (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -925,10 +966,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi32(__m64 __m, int __count)
 {
-  return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+  return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
+                                            __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -948,10 +990,11 @@ _mm_srai_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
-  return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+  return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
+                                           (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -970,10 +1013,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi16(__m64 __m, int __count)
 {
-  return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+  return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
+                                            __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -993,10 +1037,11 @@ _mm_srli_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
-  return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+  return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
+                                           (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -1015,10 +1060,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi32(__m64 __m, int __count)
 {
-  return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+  return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
+                                            __count));
 }
 
 /// Right-shifts the first 64-bit integer parameter by the number of bits
@@ -1035,10 +1081,11 @@ _mm_srli_pi32(__m64 __m, int __count)
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-  return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+  return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
+                                           (__v2di)__anyext128(__count)));
 }
 
 /// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1056,10 +1103,11 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_si64(__m64 __m, int __count)
 {
-  return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+  return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
+                                            __count));
 }
 
 /// Performs a bitwise AND of two 64-bit integer vectors.
@@ -1074,10 +1122,10 @@ _mm_srli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
-  return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+  return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1095,10 +1143,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
-  return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+  return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1113,10 +1161,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
-  return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+  return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
 }
 
 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1131,10 +1179,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
-  return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
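
The four bitwise intrinsics no longer need a builtin at all: GCC-style vector extensions express them directly, so the optimizer sees ordinary IR that it can constant-fold and combine like any scalar operator. For instance (a sketch, not an additional change in the diff):

  // _mm_andnot_si64(a, b) is now literally "(~a) & b" on <1 x i64> vectors:
  __m64 __r = (__m64)(~((__v1du)__a) & ((__v1du)__b));

The `__v1du` (unsigned) element type keeps the `~` well-defined regardless of the value's sign bit.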
@@ -1153,10 +1201,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
-  return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+  return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1175,10 +1223,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
-  return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+  return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1197,10 +1245,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
-  return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+  return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1219,10 +1267,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
-  return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+  /* This function always performs a signed comparison, but __v8qi is a char
+     which may be signed or unsigned, so use __v8qs. */
+  return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
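
The comment added to `_mm_cmpgt_pi8` is worth spelling out. `pcmpgtb` is defined as a signed comparison, but the elements of `__v8qi` have plain `char` type, whose signedness is implementation-defined and flips under `-funsigned-char`. A sketch of the failure mode the `__v8qs` (explicitly `signed char`) vector type avoids:

  // With plain char lanes and -funsigned-char, (char)0x80 compares as 128:
  //   signed:   -128 > 1  -> false (the pcmpgtb result)
  //   unsigned:  128 > 1  -> true  (wrong for this intrinsic)
  __v8qs __sa = (__v8qs)__m1;   // signed char lanes, regardless of char's sign
  __v8qs __sb = (__v8qs)__m2;
  return (__m64)(__sa > __sb);  // element-wise signed compare: 0 or -1 mask

The equality comparisons above don't need this, since `==` is sign-agnostic.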
@@ -1241,10 +1291,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
-  return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+  return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1263,10 +1313,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
-  return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+  return (__m64)((__v2si)__m1 > (__v2si)__m2);
 }
 
 /// Constructs a 64-bit integer vector initialized to zero.
@@ -1276,7 +1326,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setzero_si64(void)
 {
   return __extension__ (__m64){ 0LL };
@@ -1297,10 +1347,10 @@ _mm_setzero_si64(void)
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi32(int __i1, int __i0)
 {
-  return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+  return __extension__ (__m64)(__v2si){__i0, __i1};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1320,10 +1370,10 @@ _mm_set_pi32(int __i1, int __i0)
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
-  return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+  return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1351,12 +1401,12 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 /// \param __b0
 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
 {
-  return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
-                                             __b4, __b5, __b6, __b7);
+  return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
+                                       __b4, __b5, __b6, __b7};
 }
 
 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
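
The `_mm_set_*` family keeps its long-standing semantics; only the implementation moves from `__builtin_ia32_vec_init_*` builtins to plain vector literals, which lower to generic IR. As a usage reminder, arguments run from the most-significant element down to the least (a sketch, not part of the diff):

  __m64 __v = _mm_set_pi16(3, 2, 1, 0);
  // bits [15:0] = 0, [31:16] = 1, [47:32] = 2, [63:48] = 3,
  // i.e. the same value as _mm_setr_pi16(0, 1, 2, 3).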
@@ -1372,7 +1422,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi32(int __i)
 {
   return _mm_set_pi32(__i, __i);
@@ -1391,7 +1441,7 @@ _mm_set1_pi32(int __i)
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi16(short __w)
 {
   return _mm_set_pi16(__w, __w, __w, __w);
@@ -1409,7 +1459,7 @@ _mm_set1_pi16(short __w)
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi8(char __b)
 {
   return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
@@ -1430,7 +1480,7 @@ _mm_set1_pi8(char __b)
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi32(int __i0, int __i1)
 {
   return _mm_set_pi32(__i1, __i0);
@@ -1453,7 +1503,7 @@ _mm_setr_pi32(int __i0, int __i1)
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
   return _mm_set_pi16(__w3, __w2, __w1, __w0);
@@ -1484,14 +1534,16 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 /// \param __b7
 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
 {
   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __anyext128
+#undef __trunc64
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Aliases for compatibility. */
 #define _m_empty _mm_empty

diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
@@ -19,11 +19,13 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, \
-                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("mmx,ssse3,no-evex512"), \
-                 __min_vector_width__(64)))
+                 __target__("ssse3,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x) \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x) \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+                                    1, -1, -1)
 
 /// Computes the absolute value of each of the packed 8-bit signed
 /// integers in the source operand and stores the 8-bit unsigned integer
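
These two helper macros are the core of the whole patch. `__anyext128` widens a `__m64` into the low two 32-bit lanes of an XMM value, with the `-1` shuffle indices marking the upper lanes as undefined -- the cheapest possible widening, legal whenever the upper half of the result is discarded anyway. `__trunc64` extracts element 0 of a `<2 x i64>` view, i.e. the low 8 bytes. A sketch of how they compose (variable names are illustrative):

  __m128i __w = __anyext128(__m);   // [m.lo32, m.hi32, undef, undef]
  __m64   __r = __trunc64(__w);     // round-trips the original 64 bits

Note also that `__min_vector_width__` rises from 64 to 128, since every function in the header now legitimately touches XMM registers.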
@@ -37,10 +39,10 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
-  return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+  return (__m64)__builtin_elementwise_abs((__v8qs)__a);
 }
 
 /// Computes the absolute value of each of the packed 8-bit signed
@@ -73,10 +75,10 @@ _mm_abs_epi8(__m128i __a)
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
-  return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+  return (__m64)__builtin_elementwise_abs((__v4hi)__a);
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -109,10 +111,10 @@ _mm_abs_epi16(__m128i __a)
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
-  return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+  return (__m64)__builtin_elementwise_abs((__v2si)__a);
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
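
`__builtin_elementwise_abs` is Clang's target-independent elementwise builtin; on an SSSE3 target it selects the `pabs*` instructions directly, with no MMX involvement. A small sketch, including the wrapping edge case that matches the instruction:

  __v4hi __x = { -5, 7, -32768, 0 };
  __v4hi __r = __builtin_elementwise_abs(__x);   // { 5, 7, -32768, 0 }
  // abs(INT16_MIN) wraps back to INT16_MIN, exactly as pabsw does.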
@@ -177,7 +179,10 @@ _mm_abs_epi32(__m128i __a)
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_pi8(a, b, n) \
-  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
+  ((__m64)__builtin_shufflevector( \
+      __builtin_ia32_psrldqi128_byteshift( \
+          __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
+          (n)), __extension__ (__v2di){}, 0))
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 /// 128-bit vectors of [8 x i16].
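
The new `_mm_alignr_pi8` expansion reads inside-out: the inner `__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0)` glues the operands into one 128-bit value with `b` in the low half and `a` in the high half (matching `palignr`'s `a:b` concatenation), the byte-shift builtin shifts that value right by `n` bytes, and the outer shuffle keeps lane 0, the low 64 bits. A worked case (sketch only):

  // n == 3: bytes of [ a7..a0 | b7..b0 ] shift right 3 bytes; the low
  // 8 bytes of the result are b3..b7 followed by a0..a2.
  __m64 __r = _mm_alignr_pi8(__a, __b, 3);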
@@ -242,10 +247,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_phaddw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -265,10 +271,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+  return __trunc64(__builtin_ia32_phaddd128(
+      (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
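
The horizontal operations use a different widening trick from the rest of the patch: `__builtin_shufflevector(__a, __b, 0, 1)` on two `__m64` values packs *both* operands into a single XMM register, so one 128-bit horizontal instruction computes all four results in its low half, while the high half horizontally adds the zero vector and is discarded. Sketch (illustrative names):

  // Low 64 bits of the phaddw result: [ a0+a1, a2+a3, b0+b1, b2+b3 ]
  __m128i __packed = (__m128i)__builtin_shufflevector(__a, __b, 0, 1);
  return __trunc64(__builtin_ia32_phaddw128((__v8hi)__packed, (__v8hi){}));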
@@ -317,10 +324,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_phaddsw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -386,10 +394,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_phsubw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -409,10 +418,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+  return __trunc64(__builtin_ia32_phsubd128(
+      (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -461,10 +471,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_phsubsw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -525,10 +536,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                               (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -565,10 +577,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
+                                              (__v8hi)__anyext128(__b)));
 }
 
 /// Copies the 8-bit integers from a 128-bit integer vector to the
@@ -614,12 +627,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
 ///    1: Clear the corresponding byte in the destination. \n
 ///    0: Copy the selected source byte to the corresponding byte in the
 ///    destination. \n
-///    Bits [3:0] select the source byte to be copied.
+///    Bits [2:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pshufb128(
+      (__v16qi)__builtin_shufflevector(
+          (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
+      (__v16qi)__anyext128(__b)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of
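
Why `_mm_shuffle_pi8` splats `__a` rather than using `__anyext128`: the 128-bit `pshufb` consumes index bits [3:0], while the 64-bit form only honored bits [2:0] -- which is also why the doc line above is corrected from "[3:0]" to "[2:0]". Duplicating the source into both halves makes bit 3 of each index select identical data, so indices behave exactly as the MMX instruction defined them; bit 7 still zeroes the byte. Sketch of the splat:

  // Either value of index bit 3 now reads the same source byte:
  __m128i __src = (__m128i)__builtin_shufflevector(
      (__v2si)__a, __extension__ (__v2si){}, 0, 1, 0, 1);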
@@ -720,10 +736,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
+                                            (__v16qi)__anyext128(__b)));
 }
 
 /// For each 16-bit integer in the first source operand, perform one of
@@ -746,10 +763,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
+                                            (__v8hi)__anyext128(__b)));
 }
 
 /// For each 32-bit integer in the first source operand, perform one of
@@ -772,13 +790,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+  return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
+                                            (__v4si)__anyext128(__b)));
 }
 
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #endif /* __TMMINTRIN_H */

diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
@@ -35,9 +35,21 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                  __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2 \
+  __attribute__((__always_inline__, __nodebug__, \
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x) \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x) \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+                                    1, 2, 3)
+#define __anyext128(x) \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+                                    1, -1, -1)
+#define __zeroupper64(x) \
+  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \
+                                    1, 4, 5)
 
 /// Adds the 32-bit float values in the low-order bits of the operands.
 ///
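
xmmintrin.h needs a larger helper set than tmmintrin.h, and the variants differ only in what they promise about lanes the 64-bit payload does not occupy. The choice at each call site below is deliberate (a sketch of the contract, in the patch's own names):

  __m128i __u = __anyext128(__m);    // upper lanes undefined: fine when the
                                     // result's upper half is discarded
  __m128i __z = __zext128(__m);      // upper lanes zeroed: required when the
                                     // upper bytes feed the result, as in
                                     // _mm_movemask_pi8 below
  __m128i __c = __zeroupper64(__x);  // keep lanes 0-1 of an existing 128-bit
                                     // value, zero lanes 2-3: used before
                                     // cvtps2dq so garbage in the high float
                                     // lanes cannot raise FP exceptions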
@@ -1448,10 +1460,10 @@ _mm_cvtss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts two low-order float values in a 128-bit vector of
@@ -1468,7 +1480,7 @@ _mm_cvtps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
@@ -1558,10 +1570,10 @@ _mm_cvttss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvttps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
@@ -1579,7 +1591,7 @@ _mm_cvttps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
@@ -1674,10 +1686,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
-  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+  return (__m128)__builtin_shufflevector(
+      (__v4sf)__a,
+      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
+      4, 5, 2, 3);
 }
 
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
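
`_mm_cvtpi32_ps` is rebuilt from two generic operations: convert, then blend. In `__builtin_shufflevector`, indices 0-3 name elements of the first operand and 4-7 name elements of the second, so `4, 5, 2, 3` takes the two converted floats followed by `__a`'s original upper two lanes -- reproducing the old `cvtpi2ps` merge behavior. A worked sketch with concrete values (illustrative only):

  // __a = { 9.0f, 9.0f, 2.5f, 3.5f },  __b = [2 x i32] { 1, 2 }
  // converted = { 1.0f, 2.0f, 0.0f, 0.0f }   (__b zero-extended, then cvt)
  // result    = { 1.0f, 2.0f, 2.5f, 3.5f }   (indices 4, 5, 2, 3)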
@@ -1697,7 +1712,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
@@ -2231,10 +2246,10 @@ _mm_storer_ps(float *__p, __m128 __a)
 ///    A pointer to an aligned memory location used to store the register value.
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(void *__p, __m64 __a)
 {
-  __builtin_ia32_movntq((__m64 *)__p, __a);
+  __builtin_nontemporal_store(__a, (__m64 *)__p);
 }
 
 /// Moves packed float values from a 128-bit vector of [4 x float] to a
@@ -2296,7 +2311,7 @@ void _mm_sfence(void);
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 #define _mm_extract_pi16(a, n) \
-  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
 
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 /// and inserts the lower 16-bits of an integer operand at the 16-bit offset
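
The `(unsigned short)` cast added to `_mm_extract_pi16` looks cosmetic but is a behavior detail: `pextrw` zero-extends the selected word into a 32-bit register, so the intrinsic should return values in [0, 65535]. Without the cast, the builtin's `short` result could be sign-extended by the `(int)` conversion. Illustration (a sketch of the intended semantics):

  int __r = _mm_extract_pi16(_mm_set_pi16(0, 0, 0, -1), 0);
  // __r == 65535 (0x0000FFFF), matching pextrw's zero-extension,
  // rather than a sign-extended -1 (0xFFFFFFFF).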
@@ -2342,10 +2357,10 @@ void _mm_sfence(void);
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2361,10 +2376,10 @@ _mm_max_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
 }
 
 /// Compares each of the corresponding packed 16-bit integer values of
@@ -2380,10 +2395,10 @@ _mm_max_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2399,10 +2414,10 @@ _mm_min_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
 }
 
 /// Takes the most significant bit from each 8-bit element in a 64-bit
@@ -2417,10 +2432,10 @@ _mm_min_pu8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing the values with bits to be extracted.
 /// \returns The most significant bit from each 8-bit element in \a __a,
 ///    written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_movemask_pi8(__m64 __a)
 {
-  return __builtin_ia32_pmovmskb((__v8qi)__a);
+  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
 }
 
 /// Multiplies packed 16-bit unsigned integer values and writes the
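
`_mm_movemask_pi8` is the clearest case where `__zext128` (rather than `__anyext128`) is mandatory: the 128-bit `pmovmskb` gathers all 16 byte sign bits, and this intrinsic's contract is that only bits [7:0] of the result may be set. Zeroed upper lanes guarantee that:

  int __mask = __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
  // bits [7:0]  = sign bits of __a's eight bytes
  // bits [15:8] = sign bits of the zeroed upper bytes, always 0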
@ -2436,10 +2451,11 @@ _mm_movemask_pi8(__m64 __a)
|
|||||||
/// \param __b
|
/// \param __b
|
||||||
/// A 64-bit integer vector containing one of the source operands.
|
/// A 64-bit integer vector containing one of the source operands.
|
||||||
/// \returns A 64-bit integer vector containing the products of both operands.
|
/// \returns A 64-bit integer vector containing the products of both operands.
|
||||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
|
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||||
_mm_mulhi_pu16(__m64 __a, __m64 __b)
|
_mm_mulhi_pu16(__m64 __a, __m64 __b)
|
||||||
{
|
{
|
||||||
return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
|
return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
|
||||||
|
(__v8hi)__anyext128(__b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
|
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
|
||||||
@ -2476,8 +2492,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
|
|||||||
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
|
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
|
||||||
/// <c>[b6, b4, b2, b0]</c>.
|
/// <c>[b6, b4, b2, b0]</c>.
|
||||||
/// \returns A 64-bit integer vector containing the shuffled values.
|
/// \returns A 64-bit integer vector containing the shuffled values.
|
||||||
#define _mm_shuffle_pi16(a, n) \
|
#define _mm_shuffle_pi16(a, n) \
|
||||||
((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
|
((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
|
||||||
|
(n) & 0x3, ((n) >> 2) & 0x3, \
|
||||||
|
((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
|
||||||
|
|
||||||
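A side note on the new `_mm_shuffle_pi16` expansion: the immediate `n` packs four 2-bit lane indices, which the macro unpacks into a `shufflevector` mask. A quick standalone check of that decomposition (illustrative only, not part of the patch):

#include <stdio.h>

// Each 2-bit field of n selects one source lane; n == 3 yields lanes
// {3, 0, 0, 0}, the same mask the updated codegen test expects below.
int main(void) {
  unsigned n = 3;
  printf("%u %u %u %u\n",
         n & 0x3, (n >> 2) & 0x3, (n >> 4) & 0x3, (n >> 6) & 0x3); // 3 0 0 0
  return 0;
}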
 /// Conditionally copies the values from each 8-bit element in the first
 /// 64-bit integer vector operand to the specified memory location, as
@@ -2502,10 +2520,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    A pointer to a 64-bit memory location that will receive the conditionally
 ///    copied integer values. The address of the memory location does not have
 ///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS_SSE2
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
-  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+  // This is complex, because we need to support the case where __p is pointing
+  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
+  // write might cause a trap where a 64-bit maskmovq would not. (Memory
+  // locations not selected by the mask bits might still cause traps.)
+  __m128i __d128 = __anyext128(__d);
+  __m128i __n128 = __zext128(__n);
+  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
+      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
+    // If there's a risk of spurious trap due to a 128-bit write, back up the
+    // pointer by 8 bytes and shift values in registers to match.
+    __p -= 8;
+    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
+    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
+  }
+
+  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
 }
 
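A note on the boundary arithmetic in the new `_mm_maskmove_si64`: the backup path triggers exactly when the 8 bytes at `__p` fit inside the current 4 KiB page but a 16-byte store starting at `__p` would spill into the next one, i.e. page offsets 4081 through 4088. A small standalone model of the guard (illustrative only; assumes 4 KiB pages, as the header's check does):

#include <stdint.h>
#include <stdio.h>

// Mirrors the header's check: back up only when the 64-bit store would stay
// on the page while a 128-bit store would cross into the next one.
static int needs_backup(uintptr_t p) {
  uintptr_t off = p & 0xfff;
  return off >= 4096 - 15 && off <= 4096 - 8;
}

int main(void) {
  printf("%d\n", needs_backup(0x7000 + 4080)); // 0: the 16-byte store stays on the page
  printf("%d\n", needs_backup(0x7000 + 4084)); // 1: only the wide store spills over
  printf("%d\n", needs_backup(0x7000 + 4090)); // 0: even 8 bytes cross; a trap is legitimate
  return 0;
}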
 /// Computes the rounded averages of the packed unsigned 8-bit integer
@@ -2521,10 +2554,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
+                                           (__v16qi)__anyext128(__b)));
 }
 
 /// Computes the rounded averages of the packed unsigned 16-bit integer
@@ -2540,10 +2574,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
+                                           (__v8hi)__anyext128(__b)));
 }
 
 /// Subtracts the corresponding 8-bit unsigned integer values of the two
@@ -2562,10 +2597,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
 ///    sets of absolute differences between both operands. The upper bits are
 ///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
+                                            (__v16qi)__zext128(__b)));
 }
 
 #if defined(__cplusplus)
@@ -2846,22 +2882,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi16_ps(__m64 __a)
 {
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi16(__b, __a);
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
+  return __builtin_convertvector((__v4hi)__a, __v4sf);
 }
 
 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
@@ -2876,21 +2900,10 @@ _mm_cvtpi16_ps(__m64 __a)
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu16_ps(__m64 __a)
 {
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
+  return __builtin_convertvector((__v4hu)__a, __v4sf);
 }
 
 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@@ -2905,16 +2918,12 @@ _mm_cvtpu16_ps(__m64 __a)
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi8_ps(__m64 __a)
 {
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi8(__b, __a);
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
@@ -2930,15 +2939,12 @@ _mm_cvtpi8_ps(__m64 __a)
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu8_ps(__m64 __a)
 {
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts the two 32-bit signed integer values from each 64-bit vector
@@ -2957,16 +2963,12 @@ _mm_cvtpu8_ps(__m64 __a)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
-  __m128 __c;
-
-  __c = _mm_setzero_ps();
-  __c = _mm_cvtpi32_ps(__c, __b);
-  __c = _mm_movelh_ps(__c, __c);
-
-  return _mm_cvtpi32_ps(__c, __a);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -2986,16 +2988,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 ///    A 128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi16(__m128 __a)
 {
-  __m64 __b, __c;
-
-  __b = _mm_cvtps_pi32(__a);
-  __a = _mm_movehl_ps(__a, __a);
-  __c = _mm_cvtps_pi32(__a);
-
-  return _mm_packs_pi32(__b, __c);
+  return __trunc64(__builtin_ia32_packssdw128(
+      (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -3016,7 +3013,7 @@ _mm_cvtps_pi16(__m128 __a)
 ///    128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
 ///    converted values and the uppper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi8(__m128 __a)
 {
   __m64 __b, __c;
@@ -3196,8 +3193,12 @@ do { \
 #define _m_psadbw _mm_sad_pu8
 #define _m_ _mm_
 
+#undef __trunc64
+#undef __zext128
+#undef __anyext128
+#undef __zeroupper64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
@@ -502,7 +502,6 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   switch (BuiltinID) {
   default:
     return false;
-  case X86::BI__builtin_ia32_vec_ext_v2si:
   case X86::BI__builtin_ia32_vec_ext_v2di:
   case X86::BI__builtin_ia32_vextractf128_pd256:
   case X86::BI__builtin_ia32_vextractf128_ps256:
@@ -1,193 +1,200 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
 
 
 #include <immintrin.h>
 
 __m64 test_mm_abs_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b
+  // CHECK: call <8 x i8> @llvm.abs.v8i8(
   return _mm_abs_pi8(a);
 }
 
 __m64 test_mm_abs_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w
+  // CHECK: call <4 x i16> @llvm.abs.v4i16(
   return _mm_abs_pi16(a);
 }
 
 __m64 test_mm_abs_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d
+  // CHECK: call <2 x i32> @llvm.abs.v2i32(
   return _mm_abs_pi32(a);
 }
 
 __m64 test_mm_add_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.b
+  // CHECK: add <8 x i8> {{%.*}}, {{%.*}}
   return _mm_add_pi8(a, b);
 }
 
 __m64 test_mm_add_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.w
+  // CHECK: add <4 x i16> {{%.*}}, {{%.*}}
   return _mm_add_pi16(a, b);
 }
 
 __m64 test_mm_add_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.d
+  // CHECK: add <2 x i32> {{%.*}}, {{%.*}}
   return _mm_add_pi32(a, b);
 }
 
 __m64 test_mm_add_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: add i64 {{%.*}}, {{%.*}}
   return _mm_add_si64(a, b);
 }
 
 __m64 test_mm_adds_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.padds.b
+  // CHECK: call <8 x i8> @llvm.sadd.sat.v8i8(
   return _mm_adds_pi8(a, b);
 }
 
 __m64 test_mm_adds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.padds.w
+  // CHECK: call <4 x i16> @llvm.sadd.sat.v4i16(
   return _mm_adds_pi16(a, b);
 }
 
 __m64 test_mm_adds_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b
+  // CHECK: call <8 x i8> @llvm.uadd.sat.v8i8(
   return _mm_adds_pu8(a, b);
 }
 
 __m64 test_mm_adds_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w
+  // CHECK: call <4 x i16> @llvm.uadd.sat.v4i16(
   return _mm_adds_pu16(a, b);
 }
 
 __m64 test_mm_alignr_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_alignr_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b
+  // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   return _mm_alignr_pi8(a, b, 2);
 }
 
 __m64 test_mm_and_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_and_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pand
+  // CHECK: and <1 x i64> {{%.*}}, {{%.*}}
   return _mm_and_si64(a, b);
 }
 
 __m64 test_mm_andnot_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_andnot_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pandn
+  // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}}, <i64 -1>
+  // CHECK: and <1 x i64> [[TMP]], {{%.*}}
   return _mm_andnot_si64(a, b);
 }
 
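For readers less familiar with `pandn`: `_mm_andnot_si64(a, b)` computes `(~a) & b`, which is exactly the xor-with-all-ones-then-and pattern the new CHECK lines match. A small usage sketch (illustrative only; the conversion intrinsics are the x86-64 ones from mmintrin.h):

#include <mmintrin.h>

// (~a) & b over a 64-bit value, via the __m64 intrinsic (now requires SSE2).
long long andnot64(long long a, long long b) {
  return _mm_cvtm64_si64(
      _mm_andnot_si64(_mm_cvtsi64_m64(a), _mm_cvtsi64_m64(b)));
}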
 __m64 test_mm_avg_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_avg_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(
   return _mm_avg_pu8(a, b);
 }
 
 __m64 test_mm_avg_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_avg_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(
   return _mm_avg_pu16(a, b);
 }
 
 __m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b
+  // CHECK: [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
   return _mm_cmpeq_pi8(a, b);
 }
 
 __m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w
+  // CHECK: [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
   return _mm_cmpeq_pi16(a, b);
 }
 
 __m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d
+  // CHECK: [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
   return _mm_cmpeq_pi32(a, b);
 }
 
 __m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b
+  // CHECK: [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
   return _mm_cmpgt_pi8(a, b);
 }
 
 __m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w
+  // CHECK: [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
   return _mm_cmpgt_pi16(a, b);
 }
 
 __m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d
+  // CHECK: [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
   return _mm_cmpgt_pi32(a, b);
 }
 
 __m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvt_pi2ps
-  // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvt_pi2ps(a, b);
 }
 
 __m64 test_mm_cvt_ps2pi(__m128 a) {
   // CHECK-LABEL: test_mm_cvt_ps2pi
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
   return _mm_cvt_ps2pi(a);
 }
 
 __m64 test_mm_cvtpd_pi32(__m128d a) {
   // CHECK-LABEL: test_mm_cvtpd_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(
   return _mm_cvtpd_pi32(a);
 }
 
 __m128 test_mm_cvtpi16_ps(__m64 a) {
   // CHECK-LABEL: test_mm_cvtpi16_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float>
   return _mm_cvtpi16_ps(a);
 }
 
 __m128d test_mm_cvtpi32_pd(__m64 a) {
   // CHECK-LABEL: test_mm_cvtpi32_pd
-  // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd
+  // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double>
   return _mm_cvtpi32_pd(a);
 }
 
 __m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvtpi32_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvtpi32_ps(a, b);
 }
 
 __m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvtpi32x2_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvtpi32x2_ps(a, b);
 }
 
 __m64 test_mm_cvtps_pi16(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_pi16
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}})
+  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]],
   return _mm_cvtps_pi16(a);
 }
 
 __m64 test_mm_cvtps_pi32(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
   return _mm_cvtps_pi32(a);
 }
 
@@ -205,19 +212,19 @@ int test_mm_cvtsi64_si32(__m64 a) {
 
 __m64 test_mm_cvttpd_pi32(__m128d a) {
   // CHECK-LABEL: test_mm_cvttpd_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(
   return _mm_cvttpd_pi32(a);
 }
 
 __m64 test_mm_cvttps_pi32(__m128 a) {
   // CHECK-LABEL: test_mm_cvttps_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(
   return _mm_cvttps_pi32(a);
 }
 
 int test_mm_extract_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_extract_pi16
-  // CHECK: call i32 @llvm.x86.mmx.pextr.w
+  // CHECK: extractelement <4 x i16> {{%.*}}, i64 2
   return _mm_extract_pi16(a, 2);
 }
 
@@ -234,151 +241,153 @@ __m64 test_m_from_int64(long long a) {
 
 __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
   return _mm_hadd_pi16(a, b);
 }
 
 __m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
   return _mm_hadd_pi32(a, b);
 }
 
 __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadds_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+  // CHECK: insertelement <4 x i16>
   return _mm_insert_pi16(a, d, 2);
 }
 
 __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_madd_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(
   return _mm_madd_pi16(a, b);
 }
 
 __m64 test_mm_maddubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_maddubs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(
   return _mm_maddubs_pi16(a, b);
 }
 
 void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) {
   // CHECK-LABEL: test_mm_maskmove_si64
-  // CHECK: call void @llvm.x86.mmx.maskmovq
+  // CHECK: call void @llvm.x86.sse2.maskmov.dqu(
   _mm_maskmove_si64(d, n, p);
 }
 
 __m64 test_mm_max_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_max_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w
+  // CHECK: call <4 x i16> @llvm.smax.v4i16(
   return _mm_max_pi16(a, b);
 }
 
 __m64 test_mm_max_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_max_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b
+  // CHECK: call <8 x i8> @llvm.umax.v8i8(
   return _mm_max_pu8(a, b);
 }
 
 __m64 test_mm_min_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_min_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w
+  // CHECK: call <4 x i16> @llvm.smin.v4i16(
   return _mm_min_pi16(a, b);
 }
 
 __m64 test_mm_min_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_min_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b
+  // CHECK: call <8 x i8> @llvm.umin.v8i8(
   return _mm_min_pu8(a, b);
 }
 
 int test_mm_movemask_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_movemask_pi8
-  // CHECK: call i32 @llvm.x86.mmx.pmovmskb
+  // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(
   return _mm_movemask_pi8(a);
 }
 
 __m64 test_mm_mul_su32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mul_su32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: and <2 x i64> {{%.*}}, <i64 4294967295, i64 4294967295>
+  // CHECK: and <2 x i64> {{%.*}}, <i64 4294967295, i64 4294967295>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_su32(a, b);
 }
 
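The `_mm_mul_su32` checks above are the least obvious of the batch: `pmuludq` multiplies the unsigned low 32 bits of each 64-bit lane, and without the intrinsic, generic IR spells that as masking both operands to 32 bits before a full 64-bit multiply. A scalar model of one lane (illustrative only):

#include <stdint.h>

// One lane of the pmuludq lowering: keep the low 32 bits of each operand,
// then perform the full 64-bit multiply.
uint64_t pmuludq_lane(uint64_t a, uint64_t b) {
  return (a & 0xffffffffu) * (b & 0xffffffffu);
}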
 __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(
   return _mm_mulhi_pi16(a, b);
 }
 
 __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(
   return _mm_mulhi_pu16(a, b);
 }
 
 __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhrs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(
   return _mm_mulhrs_pi16(a, b);
 }
 
 __m64 test_mm_mullo_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mullo_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w
+  // CHECK: mul <4 x i16> {{%.*}}, {{%.*}}
   return _mm_mullo_pi16(a, b);
 }
 
 __m64 test_mm_or_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_or_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.por
+  // CHECK: or <1 x i64> {{%.*}}, {{%.*}}
   return _mm_or_si64(a, b);
 }
 
 __m64 test_mm_packs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.packsswb
+  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(
   return _mm_packs_pi16(a, b);
 }
 
 __m64 test_mm_packs_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(
   return _mm_packs_pi32(a, b);
 }
 
 __m64 test_mm_packs_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.packuswb
+  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(
   return _mm_packs_pu16(a, b);
 }
 
 __m64 test_mm_sad_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sad_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>
   return _mm_sad_pu8(a, b);
 }
 
@@ -471,187 +480,187 @@ __m64 test_mm_set1_pi32(int a) {
 
 __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_shuffle_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(
   return _mm_shuffle_pi8(a, b);
 }
 
 __m64 test_mm_shuffle_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_shuffle_pi16
-  // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   return _mm_shuffle_pi16(a, 3);
 }
 
 __m64 test_mm_sign_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(
   return _mm_sign_pi8(a, b);
 }
 
 __m64 test_mm_sign_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(
   return _mm_sign_pi16(a, b);
 }
 
 __m64 test_mm_sign_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(
   return _mm_sign_pi32(a, b);
 }
 
 __m64 test_mm_sll_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(
   return _mm_sll_pi16(a, b);
 }
 
 __m64 test_mm_sll_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(
   return _mm_sll_pi32(a, b);
 }
 
 __m64 test_mm_sll_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(
   return _mm_sll_si64(a, b);
 }
 
 __m64 test_mm_slli_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_slli_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(
   return _mm_slli_pi16(a, 3);
 }
 
 __m64 test_mm_slli_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_slli_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(
   return _mm_slli_pi32(a, 3);
 }
 
 __m64 test_mm_slli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_slli_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(
   return _mm_slli_si64(a, 3);
 }
 
 __m64 test_mm_sra_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sra_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psra.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(
   return _mm_sra_pi16(a, b);
 }
 
 __m64 test_mm_sra_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sra_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psra.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(
   return _mm_sra_pi32(a, b);
 }
 
 __m64 test_mm_srai_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_srai_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(
   return _mm_srai_pi16(a, 3);
 }
 
 __m64 test_mm_srai_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_srai_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(
   return _mm_srai_pi32(a, 3);
 }
 
 __m64 test_mm_srl_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(
   return _mm_srl_pi16(a, b);
 }
 
 __m64 test_mm_srl_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(
   return _mm_srl_pi32(a, b);
 }
 
 __m64 test_mm_srl_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(
   return _mm_srl_si64(a, b);
 }
 
 __m64 test_mm_srli_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_srli_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(
   return _mm_srli_pi16(a, 3);
 }
 
 __m64 test_mm_srli_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_srli_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(
   return _mm_srli_pi32(a, 3);
 }
 
 __m64 test_mm_srli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_srli_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(
   return _mm_srli_si64(a, 3);
 }
 
 void test_mm_stream_pi(__m64 *p, __m64 a) {
   // CHECK-LABEL: test_mm_stream_pi
-  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal
   _mm_stream_pi(p, a);
 }
 
 void test_mm_stream_pi_void(void *p, __m64 a) {
   // CHECK-LABEL: test_mm_stream_pi_void
-  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal
   _mm_stream_pi(p, a);
 }
 
 __m64 test_mm_sub_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.b
+  // CHECK: sub <8 x i8> {{%.*}}, {{%.*}}
   return _mm_sub_pi8(a, b);
 }
 
 __m64 test_mm_sub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.w
+  // CHECK: sub <4 x i16> {{%.*}}, {{%.*}}
   return _mm_sub_pi16(a, b);
 }
 
 __m64 test_mm_sub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.d
+  // CHECK: sub <2 x i32> {{%.*}}, {{%.*}}
   return _mm_sub_pi32(a, b);
 }
 
 __m64 test_mm_sub_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: sub i64 {{%.*}}, {{%.*}}
   return _mm_sub_si64(a, b);
 }
 
 __m64 test_mm_subs_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b
+  // CHECK: call <8 x i8> @llvm.ssub.sat.v8i8(
   return _mm_subs_pi8(a, b);
 }
 
 __m64 test_mm_subs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w
+  // CHECK: call <4 x i16> @llvm.ssub.sat.v4i16(
   return _mm_subs_pi16(a, b);
 }
 
 __m64 test_mm_subs_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b
+  // CHECK: call <8 x i8> @llvm.usub.sat.v8i8(
   return _mm_subs_pu8(a, b);
 }
 
 __m64 test_mm_subs_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.w
+  // CHECK: call <4 x i16> @llvm.usub.sat.v4i16(
   return _mm_subs_pu16(a, b);
 }
 
@@ -668,42 +677,42 @@ long long test_m_to_int64(__m64 a) {
 
 __m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw
+  // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   return _mm_unpackhi_pi8(a, b);
 }
 
 __m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   return _mm_unpackhi_pi16(a, b);
 }
 
 __m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq
+  // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 3>
   return _mm_unpackhi_pi32(a, b);
 }
 
 __m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw
+  // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   return _mm_unpacklo_pi8(a, b);
 }
 
 __m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   return _mm_unpacklo_pi16(a, b);
 }
 
 __m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq
+  // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_unpacklo_pi32(a, b);
 }
 
 __m64 test_mm_xor_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_xor_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pxor
+  // CHECK: xor <1 x i64> {{%.*}}, {{%.*}}
   return _mm_xor_si64(a, b);
 }
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx -target-feature +sse2 %s -o - | FileCheck %s
 #include <mmintrin.h>
 
 // CHECK: { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
@@ -1,23 +1,23 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +sse2 %s -o - | FileCheck %s
 #include <mmintrin.h>
 
 void shift(__m64 a, __m64 b, int c) {
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_slli_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_slli_pi32(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}})
   _mm_slli_si64(a, c);
 
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_srli_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_srli_pi32(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}})
   _mm_srli_si64(a, c);
 
-  // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_srai_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_srai_pi32(a, c);
 }
@@ -1,12 +1,11 @@
 // RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// Picking a cpu that doesn't have mmx or sse by default so we can enable it later.
+// Picking a cpu that doesn't have sse by default so we can enable it later.
 
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
-// Verify that when we turn on sse that we also turn on mmx.
-void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
+void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) {
   _mm_slli_pi16(a, c);
   _mm_slli_pi32(a, c);
   _mm_slli_si64(a, c);
@@ -19,4 +18,4 @@ void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
   _mm_srai_pi32(a, c);
 }
 
-// CHECK: "target-features"="+cx8,+mmx,+sse,+x87"
+// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87"
|
@@ -168,26 +168,6 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f);
   tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f);

-  tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2);
@@ -220,45 +200,17 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f);
   tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d);
   tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s);
   tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
   tmp_V8s = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
   tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c);
   tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pabsb(tmp_V8c);
-  tmp_V4s = __builtin_ia32_pabsw(tmp_V4s);
-  tmp_V2i = __builtin_ia32_pabsd(tmp_V2i);
-  tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi);
-  tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi);
-  tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi);
-  tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi);
-  tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s);
-  tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0);

   __builtin_ia32_incsspd(tmp_Ui);
   __builtin_ia32_incsspq(tmp_ULLi);
@@ -306,8 +258,6 @@ void f0(void) {
   (void) __builtin_ia32_clzero(tmp_vp);
   (void) __builtin_ia32_cldemote(tmp_vp);

-  tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
-  tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
   tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
   tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);

@@ -320,17 +270,12 @@ void f0(void) {
   tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
   tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
 #endif
-  tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
-  (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
-  tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
 #ifndef OPENCL
   (void) _mm_sfence();
 #endif

-  tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
   tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
   tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
   tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
@@ -348,11 +293,8 @@ void f0(void) {
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
   tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
-  tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
   tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
-  tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
-  tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
   tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
   tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
@@ -379,26 +321,9 @@ void f0(void) {
   (void) _mm_pause();
 #endif

-  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, imm_i_0_8);
-  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, imm_i_0_8);
-  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, imm_i_0_8);
-  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, imm_i_0_8);
-  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, imm_i_0_8);

   // Using non-immediate argument supported for gcc compatibility
-  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
-  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
-  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i);
-  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i);
-  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i);

-  tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i);
   tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s);
   tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i);
@@ -433,7 +358,6 @@ void f0(void) {
   (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
   tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
   tmp_V16c = __builtin_ia32_palignr128(tmp_V16c, tmp_V16c, imm_i);
-  tmp_V8c = __builtin_ia32_palignr(tmp_V8c, tmp_V8c, imm_i);
 #ifdef USE_SSE4
   tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);
@@ -14,18 +14,3 @@ int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); }
 int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); }
 // CHECK: xor
 int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); }
-
-#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n)))
-typedef __attribute__((vector_size(8))) int int2;
-
-// CHECK: palignr
-int2 align5(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 8); }
-
-// CHECK: palignr
-int2 align6(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 9); }
-
-// CHECK: palignr
-int2 align7(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 16); }
-
-// CHECK: palignr
-int2 align8(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 7); }
@@ -1,12 +0,0 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=i686-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror
-// REQUIRES: asserts
-
-#include <x86intrin.h>
-
-int __attribute__ ((__vector_size__ (8))) b;
-
-void bar(int a)
-{
-  b = __builtin_ia32_vec_init_v2si (0, a);
-}
@@ -14,7 +14,7 @@ _MM_ALIGN16 char c;
 // checking that clang emits PACKSSDW instead of PACKSSWB.

 // CHECK: define{{.*}} i64 @test_mm_cvtps_pi16
-// CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128

 __m64 test_mm_cvtps_pi16(__m128 a) {
   return _mm_cvtps_pi16(a);
@@ -4,5 +4,5 @@
 #include <tmmintrin.h>

 __m64 test1(__m64 a, __m64 b, int c) {
-  return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}}
+  return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}}
 }
@@ -4,8 +4,9 @@ import sys, fileinput

 err = 0

-# Giant associative set of builtin->intrinsic mappings where clang doesn't
-# implement the builtin since the vector operation works by default.
+# Giant associative set of builtin->intrinsic mappings where clang
+# doesn't implement the builtin. (Either because the vector operation
+# works without a builtin, or for other reasons.)

 repl_map = {
     "__builtin_ia32_addps": "_mm_add_ps",
@@ -134,6 +135,99 @@ repl_map = {
     "__builtin_ia32_vec_ext_v2di": "_mm_extract_epi64",
     "__builtin_ia32_vec_ext_v4hi": "_mm_extract_pi16",
     "__builtin_ia32_vec_ext_v4sf": "_mm_extract_ps",
+    # Removed MMX builtins
+    "__builtin_ia32_paddb": "_mm_add_pi8",
+    "__builtin_ia32_paddw": "_mm_add_pi16",
+    "__builtin_ia32_paddd": "_mm_add_pi32",
+    "__builtin_ia32_paddsb": "_mm_adds_pi8",
+    "__builtin_ia32_paddsw": "_mm_adds_pi16",
+    "__builtin_ia32_paddusb": "_mm_adds_pu8",
+    "__builtin_ia32_paddusw": "_mm_adds_pu16",
+    "__builtin_ia32_psubb": "_mm_sub_pi8",
+    "__builtin_ia32_psubw": "_mm_sub_pi16",
+    "__builtin_ia32_psubd": "_mm_sub_pi32",
+    "__builtin_ia32_psubsb": "_mm_subs_pi8",
+    "__builtin_ia32_psubsw": "_mm_subs_pi16",
+    "__builtin_ia32_psubusb": "_mm_subs_pu8",
+    "__builtin_ia32_psubusw": "_mm_subs_pu16",
+    "__builtin_ia32_pmulhw": "_mm_mulhi_pi16",
+    "__builtin_ia32_pmullw": "_mm_mullo_pi16",
+    "__builtin_ia32_pmaddwd": "_mm_madd_pi16",
+    "__builtin_ia32_pand": "_mm_and_si64",
+    "__builtin_ia32_pandn": "_mm_andnot_si64",
+    "__builtin_ia32_por": "_mm_or_si64",
+    "__builtin_ia32_pxor": "_mm_xor_si64",
+    "__builtin_ia32_psllw": "_mm_sll_pi16",
+    "__builtin_ia32_pslld": "_mm_sll_pi32",
+    "__builtin_ia32_psllq": "_mm_sll_si64",
+    "__builtin_ia32_psrlw": "_mm_srl_pi16",
+    "__builtin_ia32_psrld": "_mm_srl_pi32",
+    "__builtin_ia32_psrlq": "_mm_srl_si64",
+    "__builtin_ia32_psraw": "_mm_sra_pi16",
+    "__builtin_ia32_psrad": "_mm_sra_pi32",
+    "__builtin_ia32_psllwi": "_mm_slli_pi16",
+    "__builtin_ia32_pslldi": "_mm_slli_pi32",
+    "__builtin_ia32_psllqi": "_mm_slli_si64",
+    "__builtin_ia32_psrlwi": "_mm_srli_pi16",
+    "__builtin_ia32_psrldi": "_mm_srli_pi32",
+    "__builtin_ia32_psrlqi": "_mm_srli_si64",
+    "__builtin_ia32_psrawi": "_mm_srai_pi16",
+    "__builtin_ia32_psradi": "_mm_srai_pi32",
+    "__builtin_ia32_packsswb": "_mm_packs_pi16",
+    "__builtin_ia32_packssdw": "_mm_packs_pi32",
+    "__builtin_ia32_packuswb": "_mm_packs_pu16",
+    "__builtin_ia32_punpckhbw": "_mm_unpackhi_pi8",
+    "__builtin_ia32_punpckhwd": "_mm_unpackhi_pi16",
+    "__builtin_ia32_punpckhdq": "_mm_unpackhi_pi32",
+    "__builtin_ia32_punpcklbw": "_mm_unpacklo_pi8",
+    "__builtin_ia32_punpcklwd": "_mm_unpacklo_pi16",
+    "__builtin_ia32_punpckldq": "_mm_unpacklo_pi32",
+    "__builtin_ia32_pcmpeqb": "_mm_cmpeq_pi8",
+    "__builtin_ia32_pcmpeqw": "_mm_cmpeq_pi16",
+    "__builtin_ia32_pcmpeqd": "_mm_cmpeq_pi32",
+    "__builtin_ia32_pcmpgtb": "_mm_cmpgt_pi8",
+    "__builtin_ia32_pcmpgtw": "_mm_cmpgt_pi16",
+    "__builtin_ia32_pcmpgtd": "_mm_cmpgt_pi32",
+    "__builtin_ia32_maskmovq": "_mm_maskmove_si64",
+    "__builtin_ia32_movntq": "_mm_stream_pi",
+    "__builtin_ia32_vec_init_v2si": "_mm_setr_pi32",
+    "__builtin_ia32_vec_init_v4hi": "_mm_setr_pi16",
+    "__builtin_ia32_vec_init_v8qi": "_mm_setr_pi8",
+    "__builtin_ia32_cvtpi2ps": "_mm_cvtpi32_ps",
+    "__builtin_ia32_cvtps2pi": "_mm_cvtps_pi32",
+    "__builtin_ia32_cvttps2pi": "_mm_cvttps_pi32",
+    "__builtin_ia32_pavgb": "_mm_avg_pu8",
+    "__builtin_ia32_pavgw": "_mm_avg_pu16",
+    "__builtin_ia32_pmaxsw": "_mm_max_pi16",
+    "__builtin_ia32_pmaxub": "_mm_max_pu8",
+    "__builtin_ia32_pminsw": "_mm_min_pi16",
+    "__builtin_ia32_pminub": "_mm_min_pu8",
+    "__builtin_ia32_pmovmskb": "_mm_movemask_pi8",
+    "__builtin_ia32_pmulhuw": "_mm_mulhi_pu16",
+    "__builtin_ia32_psadbw": "_mm_sad_pu8",
+    "__builtin_ia32_pshufw": "_mm_shuffle_pi16",
+    "__builtin_ia32_cvtpd2pi": "_mm_cvtpd_pi32",
+    "__builtin_ia32_cvtpi2pd": "_mm_cvtpi32_pd",
+    "__builtin_ia32_cvttpd2pi": "_mm_cvttpd_pi32",
+    "__builtin_ia32_paddq": "_mm_add_si64",
+    "__builtin_ia32_pmuludq": "_mm_mul_su32",
+    "__builtin_ia32_psubq": "_mm_sub_si64",
+    "__builtin_ia32_pabsb": "_mm_abs_pi8",
+    "__builtin_ia32_pabsd": "_mm_abs_pi32",
+    "__builtin_ia32_pabsw": "_mm_abs_pi16",
+    "__builtin_ia32_palignr": "_mm_alignr_pi8",
+    "__builtin_ia32_phaddd": "_mm_hadd_pi32",
+    "__builtin_ia32_phaddsw": "_mm_hadds_pi16",
+    "__builtin_ia32_phaddw": "_mm_hadd_pi16",
+    "__builtin_ia32_phsubd": "_mm_hsub_pi32",
+    "__builtin_ia32_phsubsw": "_mm_hsubs_pi16",
+    "__builtin_ia32_phsubw": "_mm_hsub_pi16",
+    "__builtin_ia32_pmaddubsw": "_mm_maddubs_pi16",
+    "__builtin_ia32_pmulhrsw": "_mm_mulhrs_pi16",
+    "__builtin_ia32_pshufb": "_mm_shuffle_pi8",
+    "__builtin_ia32_psignw": "_mm_sign_pi16",
+    "__builtin_ia32_psignb": "_mm_sign_pi8",
+    "__builtin_ia32_psignd": "_mm_sign_pi32",
 }

 # Special unhandled cases:
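To make the mapping table concrete, here is a minimal migration sketch in C (the function name is a hypothetical example; the pairing itself comes from the repl_map entry above):

#include <mmintrin.h>

/* Hypothetical before/after based on the repl_map entry
 * "__builtin_ia32_paddb" -> "_mm_add_pi8". */
__m64 add_bytes(__m64 a, __m64 b) {
  /* Before: return __builtin_ia32_paddb(a, b);  -- builtin removed */
  return _mm_add_pi8(a, b); /* lane-wise addition of eight 8-bit elements */
}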
@@ -222,11 +222,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse_cvttss2si64 : ClangBuiltin<"__builtin_ia32_cvttss2si64">,
       DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;

-  def int_x86_sse_cvtps2pi : ClangBuiltin<"__builtin_ia32_cvtps2pi">,
+  def int_x86_sse_cvtps2pi :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttps2pi: ClangBuiltin<"__builtin_ia32_cvttps2pi">,
+  def int_x86_sse_cvttps2pi:
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2ps : ClangBuiltin<"__builtin_ia32_cvtpi2ps">,
+  def int_x86_sse_cvtpi2ps :
       DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
 }
@@ -426,11 +426,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_cvtsd2ss : ClangBuiltin<"__builtin_ia32_cvtsd2ss">,
       DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                              llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpd2pi : ClangBuiltin<"__builtin_ia32_cvtpd2pi">,
+  def int_x86_sse_cvtpd2pi :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttpd2pi: ClangBuiltin<"__builtin_ia32_cvttpd2pi">,
+  def int_x86_sse_cvttpd2pi:
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2pd : ClangBuiltin<"__builtin_ia32_cvtpi2pd">,
+  def int_x86_sse_cvtpi2pd :
       DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
@@ -512,49 +512,49 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".

 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_phadd_w : ClangBuiltin<"__builtin_ia32_phaddw">,
+  def int_x86_ssse3_phadd_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_w_128 : ClangBuiltin<"__builtin_ia32_phaddw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;

-  def int_x86_ssse3_phadd_d : ClangBuiltin<"__builtin_ia32_phaddd">,
+  def int_x86_ssse3_phadd_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_d_128 : ClangBuiltin<"__builtin_ia32_phaddd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                              llvm_v4i32_ty], [IntrNoMem]>;

-  def int_x86_ssse3_phadd_sw : ClangBuiltin<"__builtin_ia32_phaddsw">,
+  def int_x86_ssse3_phadd_sw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_sw_128 : ClangBuiltin<"__builtin_ia32_phaddsw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;

-  def int_x86_ssse3_phsub_w : ClangBuiltin<"__builtin_ia32_phsubw">,
+  def int_x86_ssse3_phsub_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_w_128 : ClangBuiltin<"__builtin_ia32_phsubw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;

-  def int_x86_ssse3_phsub_d : ClangBuiltin<"__builtin_ia32_phsubd">,
+  def int_x86_ssse3_phsub_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_d_128 : ClangBuiltin<"__builtin_ia32_phsubd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                              llvm_v4i32_ty], [IntrNoMem]>;

-  def int_x86_ssse3_phsub_sw : ClangBuiltin<"__builtin_ia32_phsubsw">,
+  def int_x86_ssse3_phsub_sw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_sw_128 : ClangBuiltin<"__builtin_ia32_phsubsw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;

-  def int_x86_ssse3_pmadd_ub_sw : ClangBuiltin<"__builtin_ia32_pmaddubsw">,
+  def int_x86_ssse3_pmadd_ub_sw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pmadd_ub_sw_128 : ClangBuiltin<"__builtin_ia32_pmaddubsw128">,
@@ -564,7 +564,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".

 // Packed multiply high with round and scale
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pmul_hr_sw : ClangBuiltin<"__builtin_ia32_pmulhrsw">,
+  def int_x86_ssse3_pmul_hr_sw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   def int_x86_ssse3_pmul_hr_sw_128 : ClangBuiltin<"__builtin_ia32_pmulhrsw128">,
@@ -574,34 +574,34 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".

 // Shuffle ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pshuf_b : ClangBuiltin<"__builtin_ia32_pshufb">,
+  def int_x86_ssse3_pshuf_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pshuf_b_128 : ClangBuiltin<"__builtin_ia32_pshufb128">,
       DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                              llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse_pshuf_w : ClangBuiltin<"__builtin_ia32_pshufw">,
+  def int_x86_sse_pshuf_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
                             [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }

 // Sign ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_psign_b : ClangBuiltin<"__builtin_ia32_psignb">,
+  def int_x86_ssse3_psign_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_b_128 : ClangBuiltin<"__builtin_ia32_psignb128">,
       DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                              llvm_v16i8_ty], [IntrNoMem]>;

-  def int_x86_ssse3_psign_w : ClangBuiltin<"__builtin_ia32_psignw">,
+  def int_x86_ssse3_psign_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_w_128 : ClangBuiltin<"__builtin_ia32_psignw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;

-  def int_x86_ssse3_psign_d : ClangBuiltin<"__builtin_ia32_psignd">,
+  def int_x86_ssse3_psign_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_d_128 : ClangBuiltin<"__builtin_ia32_psignd128">,
@@ -611,13 +611,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".

 // Absolute value ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pabs_b : ClangBuiltin<"__builtin_ia32_pabsb">,
+  def int_x86_ssse3_pabs_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;

-  def int_x86_ssse3_pabs_w : ClangBuiltin<"__builtin_ia32_pabsw">,
+  def int_x86_ssse3_pabs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;

-  def int_x86_ssse3_pabs_d : ClangBuiltin<"__builtin_ia32_pabsd">,
+  def int_x86_ssse3_pabs_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }

@@ -2260,118 +2260,118 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   // Addition
-  def int_x86_mmx_padd_b : ClangBuiltin<"__builtin_ia32_paddb">,
+  def int_x86_mmx_padd_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_w : ClangBuiltin<"__builtin_ia32_paddw">,
+  def int_x86_mmx_padd_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_d : ClangBuiltin<"__builtin_ia32_paddd">,
+  def int_x86_mmx_padd_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_q : ClangBuiltin<"__builtin_ia32_paddq">,
+  def int_x86_mmx_padd_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

-  def int_x86_mmx_padds_b : ClangBuiltin<"__builtin_ia32_paddsb">,
+  def int_x86_mmx_padds_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_w : ClangBuiltin<"__builtin_ia32_paddsw">,
+  def int_x86_mmx_padds_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

-  def int_x86_mmx_paddus_b : ClangBuiltin<"__builtin_ia32_paddusb">,
+  def int_x86_mmx_paddus_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_w : ClangBuiltin<"__builtin_ia32_paddusw">,
+  def int_x86_mmx_paddus_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

   // Subtraction
-  def int_x86_mmx_psub_b : ClangBuiltin<"__builtin_ia32_psubb">,
+  def int_x86_mmx_psub_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_w : ClangBuiltin<"__builtin_ia32_psubw">,
+  def int_x86_mmx_psub_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_d : ClangBuiltin<"__builtin_ia32_psubd">,
+  def int_x86_mmx_psub_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_q : ClangBuiltin<"__builtin_ia32_psubq">,
+  def int_x86_mmx_psub_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;

-  def int_x86_mmx_psubs_b : ClangBuiltin<"__builtin_ia32_psubsb">,
+  def int_x86_mmx_psubs_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psubs_w : ClangBuiltin<"__builtin_ia32_psubsw">,
+  def int_x86_mmx_psubs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;

-  def int_x86_mmx_psubus_b : ClangBuiltin<"__builtin_ia32_psubusb">,
+  def int_x86_mmx_psubus_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psubus_w : ClangBuiltin<"__builtin_ia32_psubusw">,
+  def int_x86_mmx_psubus_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;

   // Multiplication
-  def int_x86_mmx_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw">,
+  def int_x86_mmx_pmulh_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmull_w : ClangBuiltin<"__builtin_ia32_pmullw">,
+  def int_x86_mmx_pmull_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw">,
+  def int_x86_mmx_pmulhu_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulu_dq : ClangBuiltin<"__builtin_ia32_pmuludq">,
+  def int_x86_mmx_pmulu_dq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd">,
+  def int_x86_mmx_pmadd_wd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

   // Bitwise operations
-  def int_x86_mmx_pand : ClangBuiltin<"__builtin_ia32_pand">,
+  def int_x86_mmx_pand :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pandn : ClangBuiltin<"__builtin_ia32_pandn">,
+  def int_x86_mmx_pandn :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_por : ClangBuiltin<"__builtin_ia32_por">,
+  def int_x86_mmx_por :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pxor : ClangBuiltin<"__builtin_ia32_pxor">,
+  def int_x86_mmx_pxor :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

   // Averages
-  def int_x86_mmx_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb">,
+  def int_x86_mmx_pavg_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw">,
+  def int_x86_mmx_pavg_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

   // Maximum
-  def int_x86_mmx_pmaxu_b : ClangBuiltin<"__builtin_ia32_pmaxub">,
+  def int_x86_mmx_pmaxu_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmaxs_w : ClangBuiltin<"__builtin_ia32_pmaxsw">,
+  def int_x86_mmx_pmaxs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

   // Minimum
-  def int_x86_mmx_pminu_b : ClangBuiltin<"__builtin_ia32_pminub">,
+  def int_x86_mmx_pminu_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmins_w : ClangBuiltin<"__builtin_ia32_pminsw">,
+  def int_x86_mmx_pmins_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

   // Packed sum of absolute differences
-  def int_x86_mmx_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw">,
+  def int_x86_mmx_psad_bw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 }
@@ -2379,58 +2379,58 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Integer shift ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   // Shift left logical
-  def int_x86_mmx_psll_w : ClangBuiltin<"__builtin_ia32_psllw">,
+  def int_x86_mmx_psll_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psll_d : ClangBuiltin<"__builtin_ia32_pslld">,
+  def int_x86_mmx_psll_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psll_q : ClangBuiltin<"__builtin_ia32_psllq">,
+  def int_x86_mmx_psll_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;

-  def int_x86_mmx_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw">,
+  def int_x86_mmx_psrl_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrl_d : ClangBuiltin<"__builtin_ia32_psrld">,
+  def int_x86_mmx_psrl_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq">,
+  def int_x86_mmx_psrl_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;

-  def int_x86_mmx_psra_w : ClangBuiltin<"__builtin_ia32_psraw">,
+  def int_x86_mmx_psra_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psra_d : ClangBuiltin<"__builtin_ia32_psrad">,
+  def int_x86_mmx_psra_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;

   // Oddly these don't require an immediate due to a gcc compatibility issue.
-  def int_x86_mmx_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi">,
+  def int_x86_mmx_pslli_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi">,
+  def int_x86_mmx_pslli_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi">,
+  def int_x86_mmx_pslli_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;

-  def int_x86_mmx_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi">,
+  def int_x86_mmx_psrli_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi">,
+  def int_x86_mmx_psrli_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi">,
+  def int_x86_mmx_psrli_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;

-  def int_x86_mmx_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi">,
+  def int_x86_mmx_psrai_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrai_d : ClangBuiltin<"__builtin_ia32_psradi">,
+  def int_x86_mmx_psrai_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 }
@@ -2475,83 +2475,83 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 }
 // Pack ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_packsswb : ClangBuiltin<"__builtin_ia32_packsswb">,
+  def int_x86_mmx_packsswb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_packssdw : ClangBuiltin<"__builtin_ia32_packssdw">,
+  def int_x86_mmx_packssdw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_packuswb : ClangBuiltin<"__builtin_ia32_packuswb">,
+  def int_x86_mmx_packuswb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }

 // Unpacking ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_punpckhbw : ClangBuiltin<"__builtin_ia32_punpckhbw">,
+  def int_x86_mmx_punpckhbw :
      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                            [IntrNoMem]>;
-  def int_x86_mmx_punpckhwd : ClangBuiltin<"__builtin_ia32_punpckhwd">,
+  def int_x86_mmx_punpckhwd :
      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                            [IntrNoMem]>;
-  def int_x86_mmx_punpckhdq : ClangBuiltin<"__builtin_ia32_punpckhdq">,
+  def int_x86_mmx_punpckhdq :
      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                            [IntrNoMem]>;
-  def int_x86_mmx_punpcklbw : ClangBuiltin<"__builtin_ia32_punpcklbw">,
+  def int_x86_mmx_punpcklbw :
      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                            [IntrNoMem]>;
-  def int_x86_mmx_punpcklwd : ClangBuiltin<"__builtin_ia32_punpcklwd">,
+  def int_x86_mmx_punpcklwd :
      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                            [IntrNoMem]>;
-  def int_x86_mmx_punpckldq : ClangBuiltin<"__builtin_ia32_punpckldq">,
+  def int_x86_mmx_punpckldq :
      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                            [IntrNoMem]>;
 }

 // Integer comparison ops
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_pcmpeq_b : ClangBuiltin<"__builtin_ia32_pcmpeqb">,
+  def int_x86_mmx_pcmpeq_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_w : ClangBuiltin<"__builtin_ia32_pcmpeqw">,
+  def int_x86_mmx_pcmpeq_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_d : ClangBuiltin<"__builtin_ia32_pcmpeqd">,
+  def int_x86_mmx_pcmpeq_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;

-  def int_x86_mmx_pcmpgt_b : ClangBuiltin<"__builtin_ia32_pcmpgtb">,
+  def int_x86_mmx_pcmpgt_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_w : ClangBuiltin<"__builtin_ia32_pcmpgtw">,
+  def int_x86_mmx_pcmpgt_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_d : ClangBuiltin<"__builtin_ia32_pcmpgtd">,
+  def int_x86_mmx_pcmpgt_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }

 // Misc.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_maskmovq : ClangBuiltin<"__builtin_ia32_maskmovq">,
+  def int_x86_mmx_maskmovq :
       Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>;

-  def int_x86_mmx_pmovmskb : ClangBuiltin<"__builtin_ia32_pmovmskb">,
+  def int_x86_mmx_pmovmskb :
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>;

-  def int_x86_mmx_movnt_dq : ClangBuiltin<"__builtin_ia32_movntq">,
+  def int_x86_mmx_movnt_dq :
       Intrinsic<[], [llvm_ptr_ty, llvm_x86mmx_ty], []>;

-  def int_x86_mmx_palignr_b : ClangBuiltin<"__builtin_ia32_palignr">,
+  def int_x86_mmx_palignr_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty],
                             [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty],
                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;

-  def int_x86_mmx_pextr_w : ClangBuiltin<"__builtin_ia32_vec_ext_v4hi">,
+  def int_x86_mmx_pextr_w :
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<1>>]>;

-  def int_x86_mmx_pinsr_w : ClangBuiltin<"__builtin_ia32_vec_set_v4hi">,
+  def int_x86_mmx_pinsr_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty],
                             [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;