diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 10d71f7aad23..26d3ba1d7cbb 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -123,9 +123,12 @@ impl ABIMachineSpec for X64ABIMachineSpec { // extension annotations. Additionally, handling extension attributes this way allows clif // functions that use them with the Winch calling convention to interact successfully with // testing infrastructure. + // The results are also not packed if any of the types are `f16`. This is to simplify the + // implementation of `Inst::load`/`Inst::store` (which would otherwise require multiple + // instructions), and doesn't affect Winch itself as Winch doesn't support `f16` at all. let uses_extension = params .iter() - .any(|p| p.extension != ir::ArgumentExtension::None); + .any(|p| p.extension != ir::ArgumentExtension::None || p.value_type == types::F16); for (ix, param) in params.iter().enumerate() { let last_param = ix == params.len() - 1; @@ -169,13 +172,23 @@ impl ABIMachineSpec for X64ABIMachineSpec { // https://godbolt.org/z/PhG3ob if param.value_type.bits() > 64 - && !param.value_type.is_vector() + && !(param.value_type.is_vector() || param.value_type.is_float()) && !flags.enable_llvm_abi_extensions() { panic!( "i128 args/return values not supported unless LLVM ABI extensions are enabled" ); } + // As MSVC doesn't support f16/f128 there is no standard way to pass/return them with + // the Windows ABI. LLVM passes/returns them in XMM registers. + if matches!(param.value_type, types::F16 | types::F128) + && is_fastcall + && !flags.enable_llvm_abi_extensions() + { + panic!( + "f16/f128 args/return values not supported for windows_fastcall unless LLVM ABI extensions are enabled" + ); + } // Windows fastcall dictates that `__m128i` parameters to a function // are passed indirectly as pointers, so handle that as a special @@ -410,12 +423,20 @@ impl ABIMachineSpec for X64ABIMachineSpec { // bits as well -- see `Inst::store()`). let ty = match ty { types::I8 | types::I16 | types::I32 => types::I64, + // Stack slots are always at least 8 bytes, so it's fine to load 4 bytes instead of only + // two. + types::F16 => types::F32, _ => ty, }; Inst::load(ty, mem, into_reg, ExtKind::None) } fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I { + let ty = match ty { + // See `gen_load_stack`. + types::F16 => types::F32, + _ => ty, + }; Inst::store(ty, from_reg, mem) } @@ -502,6 +523,11 @@ impl ABIMachineSpec for X64ABIMachineSpec { } fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Self::I { + let ty = match ty { + // See `gen_load_stack`. + types::F16 => types::F32, + _ => ty, + }; let mem = Amode::imm_reg(offset, base); Inst::store(ty, from_reg, mem) } diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 2cd101be3936..e104548a4b05 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1644,7 +1644,7 @@ (rule (put_in_gpr val) (if-let (value_type ty) val) (if-let (type_register_class (RegisterClass.Xmm)) ty) - (bitcast_xmm_to_gpr ty (xmm_new (put_in_reg val)))) + (bitcast_xmm_to_gpr (ty_bits ty) (xmm_new (put_in_reg val)))) ;; Put a value into a `GprMem`. ;; @@ -2252,8 +2252,10 @@ ;; Performs an xor operation of the two operands specified. 
(decl x64_xor_vector (Type Xmm XmmMem) Xmm) +(rule 1 (x64_xor_vector $F16 x y) (x64_xorps x y)) (rule 1 (x64_xor_vector $F32 x y) (x64_xorps x y)) (rule 1 (x64_xor_vector $F64 x y) (x64_xorpd x y)) +(rule 1 (x64_xor_vector $F128 x y) (x64_xorps x y)) (rule 1 (x64_xor_vector $F32X4 x y) (x64_xorps x y)) (rule 1 (x64_xor_vector $F64X2 x y) (x64_xorpd x y)) (rule 0 (x64_xor_vector (multi_lane _ _) x y) (x64_pxor x y)) @@ -2304,6 +2306,9 @@ (rule 2 (x64_load $F64 addr _ext_kind) (x64_movsd_load addr)) +(rule 2 (x64_load $F128 addr _ext_kind) + (x64_movdqu_load addr)) + (rule 2 (x64_load $F32X4 addr _ext_kind) (x64_movups_load addr)) @@ -2719,6 +2724,10 @@ (_ Unit (emit (MInst.Imm size simm64 dst)))) dst)) +;; `f16` immediates. +(rule 2 (imm $F16 (u64_nonzero bits)) + (bitcast_gpr_to_xmm 16 (imm $I16 bits))) + ;; `f32` immediates. (rule 2 (imm $F32 (u64_nonzero bits)) (x64_movd_to_xmm (imm $I32 bits))) @@ -2746,6 +2755,9 @@ (rule 0 (imm ty @ (multi_lane _bits _lanes) 0) (xmm_to_reg (xmm_zero ty))) +;; Special case for `f16` zero immediates +(rule 2 (imm ty @ $F16 (u64_zero)) (xmm_zero ty)) + ;; Special case for `f32` zero immediates (rule 2 (imm ty @ $F32 (u64_zero)) (xmm_zero ty)) @@ -5022,18 +5034,30 @@ ;;;; Casting ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(decl bitcast_xmm_to_gpr (Type Xmm) Gpr) -(rule (bitcast_xmm_to_gpr $F32 src) +(decl bitcast_xmm_to_gpr (u8 Xmm) Gpr) +(rule (bitcast_xmm_to_gpr 16 src) + (x64_pextrw src 0)) +(rule (bitcast_xmm_to_gpr 32 src) (x64_movd_to_gpr src)) -(rule (bitcast_xmm_to_gpr $F64 src) +(rule (bitcast_xmm_to_gpr 64 src) (x64_movq_to_gpr src)) -(decl bitcast_gpr_to_xmm (Type Gpr) Xmm) -(rule (bitcast_gpr_to_xmm $I32 src) +(decl bitcast_xmm_to_gprs (Xmm) ValueRegs) +(rule (bitcast_xmm_to_gprs src) + (value_regs (x64_movq_to_gpr src) (x64_movq_to_gpr (x64_pshufd src 0b11101110)))) + +(decl bitcast_gpr_to_xmm (u8 Gpr) Xmm) +(rule (bitcast_gpr_to_xmm 16 src) + (x64_pinsrw (xmm_uninit_value) src 0)) +(rule (bitcast_gpr_to_xmm 32 src) (x64_movd_to_xmm src)) -(rule (bitcast_gpr_to_xmm $I64 src) +(rule (bitcast_gpr_to_xmm 64 src) (x64_movq_to_xmm src)) +(decl bitcast_gprs_to_xmm (ValueRegs) Xmm) +(rule (bitcast_gprs_to_xmm src) + (x64_punpcklqdq (x64_movq_to_xmm (value_regs_get_gpr src 0)) (x64_movq_to_xmm (value_regs_get_gpr src 1)))) + ;;;; Stack Addresses ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl stack_addr_impl (StackSlot Offset32) Gpr) diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index dfc8e4fe7190..4015617093ad 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1428,10 +1428,11 @@ pub(crate) fn emit( let op = match *ty { types::F64 => SseOpcode::Movsd, types::F32 => SseOpcode::Movsd, + types::F16 => SseOpcode::Movsd, types::F32X4 => SseOpcode::Movaps, types::F64X2 => SseOpcode::Movapd, ty => { - debug_assert!(ty.is_vector() && ty.bytes() == 16); + debug_assert!((ty.is_float() || ty.is_vector()) && ty.bytes() == 16); SseOpcode::Movdqa } }; diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index a8f55c483552..7ce254c50ff2 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -630,11 +630,12 @@ impl Inst { } RegClass::Float => { let opcode = match ty { + types::F16 => panic!("loading a f16 requires multiple instructions"), types::F32 => SseOpcode::Movss, types::F64 => SseOpcode::Movsd, types::F32X4 => 
SseOpcode::Movups, types::F64X2 => SseOpcode::Movupd, - _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu, + _ if (ty.is_float() || ty.is_vector()) && ty.bits() == 128 => SseOpcode::Movdqu, _ => unimplemented!("unable to load type: {}", ty), }; Inst::xmm_unary_rm_r(opcode, RegMem::mem(from_addr), to_reg) @@ -650,11 +651,12 @@ impl Inst { RegClass::Int => Inst::mov_r_m(OperandSize::from_ty(ty), from_reg, to_addr), RegClass::Float => { let opcode = match ty { + types::F16 => panic!("storing a f16 requires multiple instructions"), types::F32 => SseOpcode::Movss, types::F64 => SseOpcode::Movsd, types::F32X4 => SseOpcode::Movups, types::F64X2 => SseOpcode::Movupd, - _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqu, + _ if (ty.is_float() || ty.is_vector()) && ty.bits() == 128 => SseOpcode::Movdqu, _ => unimplemented!("unable to store type: {}", ty), }; Inst::xmm_mov_r_m(opcode, from_reg, to_addr) @@ -1621,6 +1623,7 @@ impl PrettyPrint for Inst { let suffix = match *ty { types::F64 => "sd", types::F32 => "ss", + types::F16 => "ss", types::F32X4 => "aps", types::F64X2 => "apd", _ => "dqa", @@ -2605,9 +2608,9 @@ impl MachInst for Inst { // those, which may write more lanes that we need, but are specified to have // zero-latency. let opcode = match ty { - types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps, + types::F16 | types::F32 | types::F64 | types::F32X4 => SseOpcode::Movaps, types::F64X2 => SseOpcode::Movapd, - _ if ty.is_vector() && ty.bits() == 128 => SseOpcode::Movdqa, + _ if (ty.is_float() || ty.is_vector()) && ty.bits() == 128 => SseOpcode::Movdqa, _ => unimplemented!("unable to move type: {}", ty), }; Inst::xmm_unary_rm_r(opcode, RegMem::reg(src_reg), dst_reg) @@ -2628,8 +2631,10 @@ impl MachInst for Inst { types::I64 => Ok((&[RegClass::Int], &[types::I64])), types::R32 => panic!("32-bit reftype pointer should never be seen on x86-64"), types::R64 => Ok((&[RegClass::Int], &[types::R64])), + types::F16 => Ok((&[RegClass::Float], &[types::F16])), types::F32 => Ok((&[RegClass::Float], &[types::F32])), types::F64 => Ok((&[RegClass::Float], &[types::F64])), + types::F128 => Ok((&[RegClass::Float], &[types::F128])), types::I128 => Ok((&[RegClass::Int, RegClass::Int], &[types::I64, types::I64])), _ if ty.is_vector() => { assert!(ty.bits() <= 128); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index b814f9d23ef1..7410a9dd7b41 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -22,6 +22,11 @@ (value_regs (imm $I64 x) (imm $I64 0))) +;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (f16const (u16_from_ieee16 x))) + (imm $F16 x)) + ;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (f32const (u32_from_ieee32 x))) @@ -32,6 +37,14 @@ (rule (lower (f64const (u64_from_ieee64 x))) (imm $F64 x)) +;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (f128const const)) + ;; TODO use Inst::gen_constant() instead. 
+ (x64_xmm_load_const $F128 (const_to_vconst const))) + +(rule 1 (lower (f128const (u128_from_constant 0))) + (xmm_zero $F128)) + ;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (null))) @@ -1585,7 +1598,7 @@ lane1 (u8_from_uimm8 1))) (if-let $true (use_sse41)) - (x64_pinsrq (bitcast_gpr_to_xmm $I64 lane0) lane1 1)) + (x64_pinsrq (bitcast_gpr_to_xmm 64 lane0) lane1 1)) (rule 1 (lower (insertlane vec @ (value_type $F32X4) (sinkable_load val) (u8_from_uimm8 idx))) (if-let $true (use_sse41)) @@ -2926,10 +2939,14 @@ ;; For `$F32` and `$F64` this is important--we only want to load 32 or 64 bits. ;; But for the 128-bit types, this is not strictly necessary for performance but ;; might help with clarity during disassembly. +(rule (lower (has_type $F16 (load flags address offset))) + (x64_pinsrw (xmm_uninit_value) (to_amode flags address offset) 0)) (rule (lower (has_type $F32 (load flags address offset))) (x64_movss_load (to_amode flags address offset))) (rule (lower (has_type $F64 (load flags address offset))) (x64_movsd_load (to_amode flags address offset))) +(rule (lower (has_type $F128 (load flags address offset))) + (x64_movdqu_load (to_amode flags address offset))) (rule (lower (has_type $F32X4 (load flags address offset))) (x64_movups_load (to_amode flags address offset))) (rule (lower (has_type $F64X2 (load flags address offset))) @@ -3007,6 +3024,22 @@ (side_effect (x64_movimm_m ty (to_amode flags address offset) imm))) +;; F16 stores of values in XMM registers. +(rule 0 (lower (store flags + value @ (value_type $F16) + address + offset)) + (side_effect + (x64_movrm $I16 (to_amode flags address offset) (bitcast_xmm_to_gpr 16 value)))) + +(rule 1 (lower (store flags + value @ (value_type $F16) + address + offset)) + (if-let $true (use_sse41)) + (side_effect + (x64_pextrw_store (to_amode flags address offset) value 0))) + ;; F32 stores of values in XMM registers. (rule 1 (lower (store flags value @ (value_type $F32) @@ -3023,6 +3056,14 @@ (side_effect (x64_movsd_store (to_amode flags address offset) value))) +;; F128 stores of values in XMM registers. +(rule 1 (lower (store flags + value @ (value_type $F128) + address + offset)) + (side_effect + (x64_movdqu_store (to_amode flags address offset) value))) + ;; Stores of F32X4 vectors. (rule 1 (lower (store flags value @ (value_type $F32X4) @@ -4013,17 +4054,17 @@ ;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type $I32 (bitcast _ src @ (value_type $F32)))) - (bitcast_xmm_to_gpr $F32 src)) +(rule -3 (lower (has_type (is_gpr_type (fits_in_64 ty)) (bitcast _ src @ (value_type (is_xmm_type _))))) + (bitcast_xmm_to_gpr (ty_bits ty) src)) -(rule (lower (has_type $F32 (bitcast _ src @ (value_type $I32)))) - (bitcast_gpr_to_xmm $I32 src)) +(rule -2 (lower (has_type (is_xmm_type (fits_in_64 ty)) (bitcast _ src @ (value_type (is_gpr_type _))))) + (bitcast_gpr_to_xmm (ty_bits ty) src)) -(rule (lower (has_type $I64 (bitcast _ src @ (value_type $F64)))) - (bitcast_xmm_to_gpr $F64 src)) +(rule -1 (lower (has_type $I128 (bitcast _ src @ (value_type (is_xmm_type _))))) + (bitcast_xmm_to_gprs src)) -(rule (lower (has_type $F64 (bitcast _ src @ (value_type $I64)))) - (bitcast_gpr_to_xmm $I64 src)) +(rule 0 (lower (has_type (is_xmm_type _) (bitcast _ src @ (value_type $I128)))) + (bitcast_gprs_to_xmm src)) ;; Bitcast between types residing in GPR registers is a no-op. 
(rule 1 (lower (has_type (is_gpr_type _) @@ -4554,7 +4595,7 @@ ;; Case 2: when moving a scalar value of any other type, use MOVD to zero ;; the upper lanes. (rule (lower (scalar_to_vector src @ (value_type ty))) - (bitcast_gpr_to_xmm ty src)) + (bitcast_gpr_to_xmm (ty_bits ty) src)) ;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single ;; MOVSS/MOVSD instruction. @@ -4581,10 +4622,10 @@ (x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0))) (rule 1 (lower (has_type $I8X16 (splat src))) (if-let $true (use_ssse3)) - (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16))) + (x64_pshufb (bitcast_gpr_to_xmm 32 src) (xmm_zero $I8X16))) (rule 2 (lower (has_type $I8X16 (splat src))) (if-let $true (use_avx2)) - (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src))) + (x64_vpbroadcastb (bitcast_gpr_to_xmm 32 src))) (rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) (if-let $true (use_sse41)) (if-let $true (use_ssse3)) @@ -4599,10 +4640,10 @@ ;; at that point is two of the 16-bit values we want to broadcast) to all the ;; lanes. (rule 0 (lower (has_type $I16X8 (splat src))) - (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0)) + (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm 32 src) 0) 0)) (rule 1 (lower (has_type $I16X8 (splat src))) (if-let $true (use_avx2)) - (x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src))) + (x64_vpbroadcastw (bitcast_gpr_to_xmm 32 src))) (rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0)) (rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) @@ -4614,10 +4655,10 @@ ;; ;; Note that sinkable-load cases come later (rule 0 (lower (has_type $I32X4 (splat src))) - (x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0)) + (x64_pshufd (bitcast_gpr_to_xmm 32 src) 0)) (rule 1 (lower (has_type $I32X4 (splat src))) (if-let $true (use_avx2)) - (x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src))) + (x64_vpbroadcastd (bitcast_gpr_to_xmm 32 src))) ;; f32x4.splat - the source is already in an xmm register so `shufps` is all ;; that's necessary to complete the splat. This is specialized to `vbroadcastss` @@ -4649,7 +4690,7 @@ ;; lane. A minor specialization for sinkable loads to avoid going through a gpr ;; for i64 splats is used as well when `movddup` is available. (rule 0 (lower (has_type $I64X2 (splat src))) - (x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00)) + (x64_pshufd (bitcast_gpr_to_xmm 64 src) 0b01_00_01_00)) (rule 0 (lower (has_type $F64X2 (splat src))) (x64_pshufd src 0b01_00_01_00)) (rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr)))) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index c79e2ae5152f..a79016375a22 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -599,7 +599,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { Some(RegisterClass::Gpr { single_register: ty != I128, }) - } else if ty == F32 || ty == F64 || (ty.is_vector() && ty.bits() == 128) { + } else if ty.is_float() || (ty.is_vector() && ty.bits() == 128) { Some(RegisterClass::Xmm) } else { None diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs index 64c343ea61bc..8798e05c175d 100644 --- a/cranelift/codegen/src/isle_prelude.rs +++ b/cranelift/codegen/src/isle_prelude.rs @@ -436,18 +436,19 @@ macro_rules! 
isle_common_prelude_methods {
     #[inline]
     fn ty_scalar_float(&mut self, ty: Type) -> Option<Type> {
-        match ty {
-            F32 | F64 => Some(ty),
-            _ => None,
+        if ty.is_float() {
+            Some(ty)
+        } else {
+            None
         }
     }

     #[inline]
     fn ty_float_or_vec(&mut self, ty: Type) -> Option<Type> {
-        match ty {
-            F32 | F64 => Some(ty),
-            ty if ty.is_vector() => Some(ty),
-            _ => None,
+        if ty.is_float() || ty.is_vector() {
+            Some(ty)
+        } else {
+            None
         }
     }

@@ -600,6 +601,10 @@ macro_rules! isle_common_prelude_methods {
         }
     }

+    fn u16_from_ieee16(&mut self, val: Ieee16) -> u16 {
+        val.bits()
+    }
+
     fn u32_from_ieee32(&mut self, val: Ieee32) -> u32 {
         val.bits()
     }
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index 2e7b3ee8775c..54d8428230c6 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -630,7 +630,11 @@
 (decl pure imm64_masked (Type u64) Imm64)
 (extern constructor imm64_masked imm64_masked)

-;; Extract a `u64` from an `Ieee32`.
+;; Extract a `u16` from an `Ieee16`.
+(decl u16_from_ieee16 (u16) Ieee16)
+(extern extractor infallible u16_from_ieee16 u16_from_ieee16)
+
+;; Extract a `u32` from an `Ieee32`.
 (decl u32_from_ieee32 (u32) Ieee32)
 (extern extractor infallible u32_from_ieee32 u32_from_ieee32)

diff --git a/cranelift/filetests/filetests/isa/x64/bitcast.clif b/cranelift/filetests/filetests/isa/x64/bitcast.clif
index aee59d0171b4..ef1a2f3fca21 100644
--- a/cranelift/filetests/filetests/isa/x64/bitcast.clif
+++ b/cranelift/filetests/filetests/isa/x64/bitcast.clif
@@ -1,7 +1,59 @@
 test compile precise-output
+set enable_llvm_abi_extensions
 target x86_64

-function %f1(f32) -> i32 {
+function %bitcast_f16_to_i16(f16) -> i16 {
+block0(v0: f16):
+    v1 = bitcast.i16 v0
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pextrw $0, %xmm0, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; pextrw $0, %xmm0, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %bitcast_i16_to_f16(i16) -> f16 {
+block0(v0: i16):
+    v1 = bitcast.f16 v0
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; uninit %xmm0
+; pinsrw $0, %xmm0, %rdi, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; pinsrw $0, %edi, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %bitcast_f32_to_i32(f32) -> i32 {
 block0(v0: f32):
     v1 = bitcast.i32 v0
     return v1
@@ -26,7 +78,7 @@ block0(v0: f32):
 ; popq %rbp
 ; retq

-function %f2(i32) -> f32 {
+function %bitcast_i32_to_f32(i32) -> f32 {
 block0(v0: i32):
     v1 = bitcast.f32 v0
     return v1
@@ -51,7 +103,7 @@ block0(v0: i32):
 ; popq %rbp
 ; retq

-function %f3(f64) -> i64 {
+function %bitcast_f64_to_i64(f64) -> i64 {
 block0(v0: f64):
     v1 = bitcast.i64 v0
     return v1
@@ -76,7 +128,7 @@ block0(v0: f64):
 ; popq %rbp
 ; retq

-function %f4(i64) -> f64 {
+function %bitcast_i64_to_f64(i64) -> f64 {
 block0(v0: i64):
     v1 = bitcast.f64 v0
     return v1
@@ -101,3 +153,119 @@ block0(v0: i64):
 ; popq %rbp
 ; retq

+function %bitcast_f128_to_i128(f128) -> i128 {
+block0(v0: f128):
+    v1 = bitcast.i128 v0
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movq %xmm0, %rax
+; pshufd $238, %xmm0, %xmm4
+; movq %xmm4, %rdx
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movq %xmm0, %rax
+; pshufd $0xee, %xmm0, %xmm4
+; movq %xmm4, 
%rdx
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %bitcast_i128_to_f128(i128) -> f128 {
+block0(v0: i128):
+    v1 = bitcast.f128 v0
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movq %rdi, %xmm0
+; movq %rsi, %xmm5
+; punpcklqdq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movq %rdi, %xmm0
+; movq %rsi, %xmm5
+; punpcklqdq %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %bitcast_i64x2_to_i128(i64x2) -> i128 {
+block0(v0: i64x2):
+    v1 = bitcast.i128 little v0
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movq %xmm0, %rax
+; pshufd $238, %xmm0, %xmm4
+; movq %xmm4, %rdx
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movq %xmm0, %rax
+; pshufd $0xee, %xmm0, %xmm4
+; movq %xmm4, %rdx
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %bitcast_i128_to_i64x2(i128) -> i64x2 {
+block0(v0: i128):
+    v1 = bitcast.i64x2 little v0
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movq %rdi, %xmm0
+; movq %rsi, %xmm5
+; punpcklqdq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movq %rdi, %xmm0
+; movq %rsi, %xmm5
+; punpcklqdq %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/call-conv.clif b/cranelift/filetests/filetests/isa/x64/call-conv.clif
index be52290ec41f..a3b06408892d 100644
--- a/cranelift/filetests/filetests/isa/x64/call-conv.clif
+++ b/cranelift/filetests/filetests/isa/x64/call-conv.clif
@@ -1,4 +1,5 @@
 test compile precise-output
+set enable_llvm_abi_extensions
 target x86_64

 function %one_arg(i32) system_v {
@@ -594,3 +595,98 @@ block0(v0: i32, v1: i8x16):
 ; popq %rbp
 ; retq

+function %second_f16(f16, f16) -> f16 system_v {
+block0(v0: f16, v1: f16):
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %second_f128(f128, f128) -> f128 system_v {
+block0(v0: f128, v1: f128):
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %second_f16_fastcall(f16, f16) -> f16 windows_fastcall {
+block0(v0: f16, v1: f16):
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %second_f128_fastcall(f128, f128) -> f128 windows_fastcall {
+block0(v0: f128, v1: f128):
+    return v1
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movdqa %xmm1, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; retq
diff --git 
a/cranelift/filetests/filetests/isa/x64/f128const.clif b/cranelift/filetests/filetests/isa/x64/f128const.clif new file mode 100644 index 000000000000..22ee804d6d6f --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/f128const.clif @@ -0,0 +1,69 @@ +test compile precise-output +target x86_64 + +function %ret_0() -> f128 { +block0(): + v0 = f128const 0.0 + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; xorps %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ret_1() -> f128 { +block0(): + v0 = f128const 0x1.0 + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu const(0), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu 0x14(%rip), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %bh, %bh + diff --git a/cranelift/filetests/filetests/isa/x64/f16const.clif b/cranelift/filetests/filetests/isa/x64/f16const.clif new file mode 100644 index 000000000000..edcaf2ac29fd --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/f16const.clif @@ -0,0 +1,57 @@ +test compile precise-output +target x86_64 + +function %ret_0() -> f16 { +block0(): + v0 = f16const 0.0 + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; xorps %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ret_1() -> f16 { +block0(): + v0 = f16const 0x1.0 + return v0 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movl $15360, %esi +; uninit %xmm0 +; pinsrw $0, %xmm0, %rsi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movl $0x3c00, %esi +; pinsrw $0, %esi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/load-f16-f128.clif b/cranelift/filetests/filetests/isa/x64/load-f16-f128.clif new file mode 100644 index 000000000000..8bd94ae70709 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/load-f16-f128.clif @@ -0,0 +1,54 @@ +test compile precise-output +target x86_64 + +function %load_f16(i64) -> f16 { +block0(v0: i64): + v1 = load.f16 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm0 +; pinsrw $0, %xmm0, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pinsrw $0, (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_f128(i64) -> f128 { +block0(v0: i64): + v1 = load.f128 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu (%rdi), %xmm0 ; trap: 
heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/select.clif b/cranelift/filetests/filetests/isa/x64/select.clif index 4951bce58d10..d25799b74138 100644 --- a/cranelift/filetests/filetests/isa/x64/select.clif +++ b/cranelift/filetests/filetests/isa/x64/select.clif @@ -65,3 +65,131 @@ block0(v0: f32, v1: f32, v2: i64, v3: i64): ; popq %rbp ; retq +function %select_f16(i8, f16, f16) -> f16 { +block0(v0: i8, v1: f16, v2: f16): + v3 = select.f16 v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; movss %xmm0, %xmm0; jz $next; movss %xmm6, %xmm0; $next: +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; je 0x19 +; movsd %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %select_f32(i8, f32, f32) -> f32 { +block0(v0: i8, v1: f32, v2: f32): + v3 = select.f32 v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; movss %xmm0, %xmm0; jz $next; movss %xmm6, %xmm0; $next: +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; je 0x19 +; movsd %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %select_f64(i8, f64, f64) -> f64 { +block0(v0: i8, v1: f64, v2: f64): + v3 = select.f64 v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; movsd %xmm0, %xmm0; jz $next; movsd %xmm6, %xmm0; $next: +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; je 0x19 +; movsd %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %select_f128(i8, f128, f128) -> f128 { +block0(v0: i8, v1: f128, v2: f128): + v3 = select.f128 v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; movdqa %xmm0, %xmm0; jz $next; movdqa %xmm6, %xmm0; $next: +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; testb %dil, %dil +; movdqa %xmm0, %xmm6 +; movdqa %xmm1, %xmm0 +; je 0x19 +; movdqa %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/store-f16-f128.clif b/cranelift/filetests/filetests/isa/x64/store-f16-f128.clif new file mode 100644 index 000000000000..729d271c7913 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/store-f16-f128.clif @@ -0,0 +1,55 @@ +test compile precise-output +target x86_64 + +function %store_f16(f16, i64) { +block0(v0: f16, v1: i64): + store.f16 v0, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrw $0, %xmm0, %rcx +; movw %cx, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrw $0, %xmm0, %ecx +; movw %cx, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %store_f128(f128, 
i64) { +block0(v0: f128, v1: i64): + store.f128 v0, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/store-f16-sse41.clif b/cranelift/filetests/filetests/isa/x64/store-f16-sse41.clif new file mode 100644 index 000000000000..bb89bf07354a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/store-f16-sse41.clif @@ -0,0 +1,28 @@ +test compile precise-output +target x86_64 sse41 + +function %store_f16(f16, i64) { +block0(v0: f16, v1: i64): + store.f16 v0, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrw $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrw $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/bitcast-f16-f128.clif b/cranelift/filetests/filetests/runtests/bitcast-f16-f128.clif index 5448ae48134f..d130c3e75e0d 100644 --- a/cranelift/filetests/filetests/runtests/bitcast-f16-f128.clif +++ b/cranelift/filetests/filetests/runtests/bitcast-f16-f128.clif @@ -1,4 +1,7 @@ test interpret +test run +set enable_llvm_abi_extensions +target x86_64 function %bitcast_i16_f16(i16) -> f16 fast { block0(v0: i16): diff --git a/cranelift/filetests/filetests/runtests/f128const.clif b/cranelift/filetests/filetests/runtests/f128const.clif index d670a2a67871..eb77aa245488 100644 --- a/cranelift/filetests/filetests/runtests/f128const.clif +++ b/cranelift/filetests/filetests/runtests/f128const.clif @@ -1,4 +1,7 @@ test interpret +test run +set enable_llvm_abi_extensions +target x86_64 ;; These values are special for RISC-V since it has a dedicated @@ -51,3 +54,11 @@ block0: } ; run: %f128const_neg_nan() == -NaN + +function %f128const_zero() -> f128 { +block0: + v0 = f128const 0.0 + return v0 +} + +; run: %f128const_zero() == 0.0 diff --git a/cranelift/filetests/filetests/runtests/f16const.clif b/cranelift/filetests/filetests/runtests/f16const.clif index 99507b888548..16797f2fa892 100644 --- a/cranelift/filetests/filetests/runtests/f16const.clif +++ b/cranelift/filetests/filetests/runtests/f16const.clif @@ -1,4 +1,7 @@ test interpret +test run +set enable_llvm_abi_extensions +target x86_64 ;; These values are special for RISC-V since it has a dedicated @@ -51,3 +54,11 @@ block0: } ; run: %f16const_neg_nan() == -NaN + +function %f16const_zero() -> f16 { +block0: + v0 = f16const 0.0 + return v0 +} + +; run: %f16const_zero() == 0.0 diff --git a/cranelift/filetests/filetests/runtests/select-f16-f128.clif b/cranelift/filetests/filetests/runtests/select-f16-f128.clif new file mode 100644 index 000000000000..518d6efa37cf --- /dev/null +++ b/cranelift/filetests/filetests/runtests/select-f16-f128.clif @@ -0,0 +1,139 @@ +test interpret +test run +set enable_llvm_abi_extensions +target x86_64 + +function %select_icmp_i8_f16(i8, f16, f16) -> f16 { +block0(v0: i8, v1: f16, v2: f16): + v3 = iconst.i8 42 + v4 = icmp eq v0, v3 + v5 = select.f16 v4, v1, v2 + return v5 +} +; run: %select_icmp_i8_f16(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i8_f16(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i8_f16(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i8_f16(10, +Inf, -Inf) == -Inf +; 
run: %select_icmp_i8_f16(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i8_f16(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i8_f16(42, 0x0.800p-14, -0x0.800p-14) == 0x0.800p-14 +; run: %select_icmp_i8_f16(10, 0x0.800p-14, -0x0.800p-14) == -0x0.800p-14 + + +function %select_icmp_i8_f128(i8, f128, f128) -> f128 { +block0(v0: i8, v1: f128, v2: f128): + v3 = iconst.i8 42 + v4 = icmp eq v0, v3 + v5 = select.f128 v4, v1, v2 + return v5 +} +; run: %select_icmp_i8_f128(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i8_f128(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i8_f128(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i8_f128(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i8_f128(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i8_f128(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i8_f128(42, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == 0x0.8000000000000000000000000000p-16382 +; run: %select_icmp_i8_f128(10, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == -0x0.8000000000000000000000000000p-16382 + + +function %select_icmp_i16_f16(i16, f16, f16) -> f16 { +block0(v0: i16, v1: f16, v2: f16): + v3 = iconst.i16 42 + v4 = icmp eq v0, v3 + v5 = select.f16 v4, v1, v2 + return v5 +} +; run: %select_icmp_i16_f16(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i16_f16(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i16_f16(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i16_f16(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i16_f16(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i16_f16(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i16_f16(42, 0x0.800p-14, -0x0.800p-14) == 0x0.800p-14 +; run: %select_icmp_i16_f16(10, 0x0.800p-14, -0x0.800p-14) == -0x0.800p-14 + + +function %select_icmp_i16_f128(i16, f128, f128) -> f128 { +block0(v0: i16, v1: f128, v2: f128): + v3 = iconst.i16 42 + v4 = icmp eq v0, v3 + v5 = select.f128 v4, v1, v2 + return v5 +} +; run: %select_icmp_i16_f128(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i16_f128(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i16_f128(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i16_f128(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i16_f128(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i16_f128(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i16_f128(42, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == 0x0.8000000000000000000000000000p-16382 +; run: %select_icmp_i16_f128(10, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == -0x0.8000000000000000000000000000p-16382 + + +function %select_icmp_i32_f16(i32, f16, f16) -> f16 { +block0(v0: i32, v1: f16, v2: f16): + v3 = iconst.i32 42 + v4 = icmp eq v0, v3 + v5 = select.f16 v4, v1, v2 + return v5 +} +; run: %select_icmp_i32_f16(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i32_f16(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i32_f16(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i32_f16(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i32_f16(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i32_f16(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i32_f16(42, 0x0.800p-14, -0x0.800p-14) == 0x0.800p-14 +; run: %select_icmp_i32_f16(10, 0x0.800p-14, -0x0.800p-14) == -0x0.800p-14 + + +function %select_icmp_i32_f128(i32, f128, f128) -> f128 { +block0(v0: i32, v1: f128, v2: f128): + v3 = iconst.i32 42 + v4 = icmp eq v0, v3 + v5 = select.f128 v4, v1, v2 + return v5 +} +; run: %select_icmp_i32_f128(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i32_f128(10, 0x0.0, 0x1.0) == 0x1.0 +; run: 
%select_icmp_i32_f128(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i32_f128(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i32_f128(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i32_f128(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i32_f128(42, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == 0x0.8000000000000000000000000000p-16382 +; run: %select_icmp_i32_f128(10, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == -0x0.8000000000000000000000000000p-16382 + + +function %select_icmp_i64_f16(i64, f16, f16) -> f16 { +block0(v0: i64, v1: f16, v2: f16): + v3 = iconst.i64 42 + v4 = icmp eq v0, v3 + v5 = select.f16 v4, v1, v2 + return v5 +} +; run: %select_icmp_i64_f16(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i64_f16(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i64_f16(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i64_f16(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i64_f16(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i64_f16(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i64_f16(42, 0x0.800p-14, -0x0.800p-14) == 0x0.800p-14 +; run: %select_icmp_i64_f16(10, 0x0.800p-14, -0x0.800p-14) == -0x0.800p-14 + + +function %select_icmp_i64_f128(i64, f128, f128) -> f128 { +block0(v0: i64, v1: f128, v2: f128): + v3 = iconst.i64 42 + v4 = icmp eq v0, v3 + v5 = select.f128 v4, v1, v2 + return v5 +} +; run: %select_icmp_i64_f128(42, 0x0.0, 0x1.0) == 0x0.0 +; run: %select_icmp_i64_f128(10, 0x0.0, 0x1.0) == 0x1.0 +; run: %select_icmp_i64_f128(42, +Inf, -Inf) == +Inf +; run: %select_icmp_i64_f128(10, +Inf, -Inf) == -Inf +; run: %select_icmp_i64_f128(42, +NaN, -NaN) == +NaN +; run: %select_icmp_i64_f128(10, +NaN, -NaN) == -NaN +; run: %select_icmp_i64_f128(42, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == 0x0.8000000000000000000000000000p-16382 +; run: %select_icmp_i64_f128(10, 0x0.8000000000000000000000000000p-16382, -0x0.8000000000000000000000000000p-16382) == -0x0.8000000000000000000000000000p-16382
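
For illustration (not part of the patch): together, the new lowerings make a scalar f16 round-trippable through a GPR via the pextrw/pinsrw bitcasts added above, composing the two directions exercised separately by the bitcast tests. A minimal runtest sketch in the same filetest style, assuming the `enable_llvm_abi_extensions` flag the other f16 runtests in this patch set; the function name is hypothetical:

test interpret
test run
set enable_llvm_abi_extensions
target x86_64

function %roundtrip_f16(f16) -> f16 {
block0(v0: f16):
    v1 = bitcast.i16 v0
    v2 = bitcast.f16 v1
    return v2
}
; run: %roundtrip_f16(0x1.0) == 0x1.0
; run: %roundtrip_f16(-NaN) == -NaN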