diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index a449144f231..155733ae13e 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -76,3 +76,6 @@ testsuite/flambda2-test-list    @xclerc
 tools/merge_archives.ml         @mshinwell @xclerc
 tools/merge_dot_a_files.sh      @mshinwell @xclerc
 tools/objinfo.ml                @mshinwell @xclerc
+
+testsuite/tests/unboxed-primitive-args  @TheNumbat
+tests/simd                              @TheNumbat
diff --git a/backend/CSEgen.ml b/backend/CSEgen.ml
index 1b0f9fc06cf..be95670c65e 100644
--- a/backend/CSEgen.ml
+++ b/backend/CSEgen.ml
@@ -233,7 +233,7 @@ class cse_generic = object (self)
 method class_of_operation op =
   match op with
   | Imove | Ispill | Ireload -> assert false   (* treated specially *)
-  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ -> Op_pure
+  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _ -> Op_pure
   | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
   | Iextcall _ | Iprobe _ | Iopaque -> assert false  (* treated specially *)
   | Istackoffset _ -> Op_other
diff --git a/backend/afl_instrument.ml b/backend/afl_instrument.ml
index e4bd0eea054..93711ba9f46 100644
--- a/backend/afl_instrument.ml
+++ b/backend/afl_instrument.ml
@@ -93,7 +93,7 @@ and instrument = function
   | Ctail e -> Ctail (instrument e)
 
   (* these are base cases and have no logging *)
-  | Cconst_int _ | Cconst_natint _ | Cconst_float _
+  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_vec128 _
   | Cconst_symbol _
   | Cvar _ as c -> c
 
diff --git a/backend/amd64/CSE.ml b/backend/amd64/CSE.ml
index 9d5587a19d2..78eb0d38e28 100644
--- a/backend/amd64/CSE.ml
+++ b/backend/amd64/CSE.ml
@@ -45,7 +45,7 @@ method! class_of_operation op =
   | Icompf _
   | Icsel _
   | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
-  | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
   | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _ | Iextcall _
   | Istackoffset _ | Iload _ | Istore _ | Ialloc _
   | Iintop _ | Iintop_imm _ | Iintop_atomic _
diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
index 3dcecff2b11..4165d7bf1a4 100644
--- a/backend/amd64/arch.ml
+++ b/backend/amd64/arch.ml
@@ -31,6 +31,9 @@ let prefetchwt1_support = ref false
 (* Emit elf notes with trap handling information. *)
 let trap_notes = ref true
 
+(* Enables usage of vector registers. *)
+let simd_regalloc_support = ref false
+
 (* Machine-specific command-line options *)
 
 let command_line_options =
@@ -59,6 +62,10 @@ let command_line_options =
       " Emit .note.ocaml_eh section with trap handling information (default)";
     "-fno-trap-notes", Arg.Clear trap_notes,
       " Do not emit .note.ocaml_eh section with trap handling information";
+    "-fsimd", Arg.Set simd_regalloc_support,
+      " Enable register allocation for SIMD vectors";
+    "-fno-simd", Arg.Clear simd_regalloc_support,
+      " Disable register allocation for SIMD vectors (default)"
   ]
 
 (* Specific operations for the AMD64 processor *)
@@ -130,6 +137,8 @@ let size_addr = 8
 let size_int = 8
 let size_float = 8
 
+let size_vec128 = 16
+
 let allow_unaligned_access = true
 
 (* Behavior of division *)
diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
index 5a7937182c8..58714e7d458 100644
--- a/backend/amd64/emit.mlp
+++ b/backend/amd64/emit.mlp
@@ -47,9 +47,14 @@ let int_reg_name =
 
 let float_reg_name = Array.init 16 (fun i -> XMM i)
 
-let register_name r =
-  if r < 100 then Reg64 (int_reg_name.(r))
-  else Regf (float_reg_name.(r - 100))
+let register_name typ r =
+  match typ with
+  | Int | Val | Addr -> Reg64 (int_reg_name.(r))
+  | Float -> Regf (float_reg_name.(r - 100))
+  | Vec128 ->
+    if not !simd_regalloc_support then
+    Misc.fatal_error "SIMD register allocation is not enabled.";
+    Regf (float_reg_name.(r - 100))
 
 (* CFI directives *)
 
@@ -80,7 +85,7 @@ let fastcode_flag = ref true
 (* Layout of the stack frame *)
 let stack_offset = ref 0
 
-let num_stack_slots = Array.make Proc.num_register_classes 0
+let num_stack_slots = Array.make Proc.num_stack_slot_classes 0
 
 let prologue_required = ref false
 
@@ -88,8 +93,13 @@ let frame_required = ref false
 
 let frame_size () =                     (* includes return address *)
   if !frame_required then begin
+    if not !simd_regalloc_support then assert (num_stack_slots.(2) = 0);
     let sz =
-      (!stack_offset + 8 * (num_stack_slots.(0) + num_stack_slots.(1)) + 8
+      (!stack_offset
+       + 8
+       + 8 * num_stack_slots.(0)
+       + 8 * num_stack_slots.(1)
+       + 16 * num_stack_slots.(2)
        + (if fp then 8 else 0))
     in Misc.align sz 16
   end else
@@ -99,9 +109,14 @@ let slot_offset loc cl =
   match loc with
   | Incoming n -> frame_size() + n
   | Local n ->
-      if cl = 0
-      then !stack_offset + n * 8
-      else !stack_offset + (num_stack_slots.(0) + n) * 8
+      if not !simd_regalloc_support then assert (num_stack_slots.(2) = 0 && cl < 2);
+      (!stack_offset +
+        (* Preserves original ordering (int -> float) *)
+        match cl with
+        | 2 -> n * 16
+        | 0 -> num_stack_slots.(2) * 16 + n * 8
+        | 1 -> num_stack_slots.(2) * 16 + num_stack_slots.(0) * 8 + n * 8
+        | _ -> Misc.fatal_error "Unknown register class")
   | Outgoing n -> n
   | Domainstate _ -> assert false (* not a stack slot *)
 
@@ -274,15 +289,16 @@ let emit_Llabel fallthrough lbl section_name =
 
 let x86_data_type_for_stack_slot = function
   | Float -> REAL8
-  | _ -> QWORD
+  | Vec128 -> VEC128
+  | Int | Addr | Val -> QWORD
 
 let reg = function
-  | { loc = Reg.Reg r } -> register_name r
+  | { loc = Reg.Reg r; typ = ty } -> register_name ty r
   | { loc = Stack (Domainstate n); typ = ty } ->
       let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in
       mem64 (x86_data_type_for_stack_slot ty) ofs R14
   | { loc = Stack s; typ = ty } as r ->
-      let ofs = slot_offset s (register_class r) in
+      let ofs = slot_offset s (stack_slot_class r.typ) in
       mem64 (x86_data_type_for_stack_slot ty) ofs RSP
   | { loc = Unknown } ->
       assert false
@@ -305,7 +321,7 @@ let reg_low_32_name = Array.map (fun r -> Reg32 r) int_reg_name
 let emit_subreg tbl typ r =
   match r.loc with
   | Reg.Reg r when r < 13 -> tbl.(r)
-  | Stack s -> mem64 typ (slot_offset s (register_class r)) RSP
+  | Stack s -> mem64 typ (slot_offset s (stack_slot_class r.Reg.typ)) RSP
   | _ -> assert false
 
 let arg8 i n = emit_subreg reg_low_8_name BYTE i.arg.(n)
@@ -347,7 +363,7 @@ let record_frame_label live dbg =
       | {typ = Val; loc = Reg r} ->
           live_offset := ((r lsl 1) + 1) :: !live_offset
       | {typ = Val; loc = Stack s} as reg ->
-          live_offset := slot_offset s (register_class reg) :: !live_offset
+          live_offset := slot_offset s (stack_slot_class reg.typ) :: !live_offset
       | {typ = Addr} as r ->
           Misc.fatal_error ("bad GC root " ^ Reg.name r)
       | _ -> ()
@@ -483,7 +499,7 @@ let build_asm_directives () : (module Asm_targets.Asm_directives_intf.S) = (
         include X86_dsl.D
 
         type data_type =
-          | NONE | DWORD | QWORD
+          | NONE | DWORD | QWORD | VEC128
 
         type nonrec constant = constant
         let const_int64 num = Const num
@@ -497,7 +513,8 @@ let build_asm_directives () : (module Asm_targets.Asm_directives_intf.S) = (
               (function
                 | NONE -> X86_ast.NONE
                 | DWORD -> X86_ast.DWORD
-                | QWORD -> X86_ast.QWORD)
+                | QWORD -> X86_ast.QWORD
+                | VEC128 -> X86_ast.VEC128)
               data_type
           in
           label ?typ str
@@ -670,6 +687,24 @@ let emit_float_constant f lbl =
   _label (emit_label lbl);
   D.qword (Const f)
 
+(* Vector constants *)
+
+let vec128_constants = ref ([] : (Cmm.vec128_bits * int) list)
+
+let add_vec128_constant bits =
+  try
+    List.assoc bits !vec128_constants
+  with Not_found ->
+    let lbl = new_label() in
+    vec128_constants := (bits, lbl) :: !vec128_constants;
+    lbl
+
+let emit_vec128_constant {high; low} lbl =
+  _label (emit_label lbl);
+  (* SIMD vectors respect little-endian byte order *)
+  D.qword (Const low);
+  D.qword (Const high)
+
 let emit_global_label_for_symbol lbl =
   add_def_symbol lbl;
   let lbl = emit_symbol lbl in
@@ -683,11 +718,27 @@ let emit_global_label s =
 let move (src : Reg.t) (dst : Reg.t) =
   if src.loc <> dst.loc then
     begin match src.typ, src.loc, dst.typ, dst.loc with
-    | Float, Reg.Reg _, Float, Reg.Reg _ -> I.movapd (reg src) (reg dst)
-    | Float, _, Float, _ -> I.movsd (reg src) (reg dst)
-    | Float, _, Int, _
-    | Int, _, Float, _ -> I.movq (reg src) (reg dst)
-    | _ -> I.mov (reg src) (reg dst)
+    | Float, Reg.Reg _, Float, Reg.Reg _
+    | Vec128, _, Vec128, _ ->
+      (* Vec128 stack slots are always aligned. *)
+      I.movapd (reg src) (reg dst)
+    | Float, _, Float, _ ->
+      I.movsd (reg src) (reg dst)
+    | Float, _, Int, _ | Int, _, Float, _ ->
+      (* CR-soon gyorsh: this case is used by the bits_of_float/float_of_bits intrinsics.
+         They should instead generate a separate Ispecific and this case should be
+         removed. *)
+      I.movq (reg src) (reg dst)
+    | (Int | Val | Addr), _, (Int | Val | Addr), _ ->
+      I.mov (reg src) (reg dst)
+    | Vec128, _, _, _ | _, _, Vec128, _ ->
+      Misc.fatal_errorf
+        "Illegal move between a vector and non-vector register (%s to %s)\n"
+        (Reg.name src) (Reg.name dst)
+    | Float, _, (Val | Addr), _ | (Val | Addr), _, Float, _ ->
+      Misc.fatal_errorf
+        "Illegal move between a float and val/addr register (%s to %s)\n"
+        (Reg.name src) (Reg.name dst)
     end
 
 let stack_to_stack_move (src : Reg.t) (dst : Reg.t) =
@@ -698,7 +749,7 @@ let stack_to_stack_move (src : Reg.t) (dst : Reg.t) =
       (* Not calling move because r15 is not in int_reg_name. *)
       I.mov (reg src) r15;
       I.mov r15 (reg dst)
-    | Float | Addr ->
+    | Float | Addr | Vec128 ->
       Misc.fatal_errorf
         "Unexpected register type for stack to stack move: from %s to %s\n"
         (Reg.name src) (Reg.name dst)
@@ -906,7 +957,15 @@ let emit_instr fallthrough i =
           I.xorpd (res i 0) (res i 0)
       | _ ->
           let lbl = add_float_constant f in
-          I.movsd (mem64_rip NONE (emit_label lbl)) (res i 0)
+          I.movsd (mem64_rip REAL8 (emit_label lbl)) (res i 0)
+      end
+  | Lop(Iconst_vec128 {high; low}) ->
+      begin match (high, low) with
+      | 0x0000_0000_0000_0000L, 0x0000_0000_0000_0000L ->
+          I.xorpd (res i 0) (res i 0)
+      | _ ->
+          let lbl = add_vec128_constant {high; low} in
+          I.movupd (mem64_rip VEC128 (emit_label lbl)) (res i 0)
       end
   | Lop(Iconst_symbol s) ->
       add_used_symbol s.sym_name;
@@ -966,11 +1025,13 @@ let emit_instr fallthrough i =
       | Sixteen_unsigned ->
           I.movzx (addressing addr WORD i 0) dest
       | Sixteen_signed ->
-          I.movsx (addressing addr WORD i 0) dest;
+          I.movsx (addressing addr WORD i 0) dest
       | Thirtytwo_unsigned ->
           I.mov (addressing addr DWORD i 0) (res32 i 0)
       | Thirtytwo_signed ->
           I.movsxd (addressing addr DWORD i 0) dest
+      | Onetwentyeight ->
+          I.movupd (addressing addr VEC128 i 0) dest
       | Single ->
           I.cvtss2sd (addressing addr REAL4 i 0) dest
       | Double ->
@@ -986,6 +1047,8 @@ let emit_instr fallthrough i =
           I.mov (arg16 i 0) (addressing addr WORD i 1)
       | Thirtytwo_signed | Thirtytwo_unsigned ->
           I.mov (arg32 i 0) (addressing addr DWORD i 1)
+      | Onetwentyeight ->
+          I.movupd (arg i 0) (addressing addr VEC128 i 1)
       | Single ->
           I.cvtsd2ss (arg i 0) xmm15;
           I.movss xmm15 (addressing addr REAL4 i 1)
@@ -1112,9 +1175,9 @@ let emit_instr fallthrough i =
       I.movq a0 (res i 0);
       I.neg (res i 0)
   | Lop(Inegf) ->
-      I.xorpd (mem64_rip OWORD (emit_symbol "caml_negf_mask")) (res i 0)
+      I.xorpd (mem64_rip VEC128 (emit_symbol "caml_negf_mask")) (res i 0)
   | Lop(Iabsf) ->
-      I.andpd (mem64_rip OWORD (emit_symbol "caml_absf_mask")) (res i 0)
+      I.andpd (mem64_rip VEC128 (emit_symbol "caml_absf_mask")) (res i 0)
   | Lop(Iaddf | Isubf | Imulf | Idivf as floatop) ->
       instr_for_floatop floatop (arg i 1) (res i 0)
   | Lop(Ifloatofint) ->
@@ -1324,8 +1387,8 @@ let emit_instr fallthrough i =
          we must be careful not to clobber it before use. *)
       let (tmp1, tmp2) =
         if i.arg.(0).loc = Reg 0 (* rax *)
-        then (phys_reg 4 (*rdx*), phys_reg 0 (*rax*))
-        else (phys_reg 0 (*rax*), phys_reg 4 (*rdx*)) in
+        then (phys_reg Int 4 (*rdx*), phys_reg Int 0 (*rax*))
+        else (phys_reg Int 0 (*rax*), phys_reg Int 4 (*rdx*)) in
 
       I.lea (mem64_rip NONE lbl) (reg tmp1);
       I.movsxd (mem64 DWORD 0 (arg64 i 0) ~scale:4 ~base:(reg64 tmp1))
@@ -1415,7 +1478,7 @@ let fundecl fundecl =
   local_realloc_sites := [];
   bound_error_sites := [];
   bound_error_call := 0;
-  for i = 0 to Proc.num_register_classes - 1 do
+  for i = 0 to Proc.num_stack_slot_classes - 1 do
     num_stack_slots.(i) <- fundecl.fun_num_stack_slots.(i);
   done;
   prologue_required := fundecl.fun_prologue_required;
@@ -1474,6 +1537,8 @@ let emit_item = function
   | Cint n -> D.qword (const_nat n)
   | Csingle f -> D.long  (Const (Int64.of_int32 (Int32.bits_of_float f)))
   | Cdouble f -> D.qword (Const (Int64.bits_of_float f))
+  (* SIMD vectors respect little-endian byte order *)
+  | Cvec128 {high; low} -> D.qword (Const low); D.qword (Const high)
   | Csymbol_address s ->
     add_used_symbol s.sym_name;
     D.qword (ConstLabel (emit_cmm_symbol s))
@@ -1554,15 +1619,20 @@ let begin_assembly unix =
 
   ()
 
-let make_stack_loc ~offset i (r : Reg.t) =
+let make_stack_loc ~offset n (r : Reg.t) =
   (* Use "Outgoing" stack locations, instead of "Local",
      because [slot_offset] emits (Outgoing n) directly
      as offset [n] from the stack pointer,
      rather than a calculation relative to the stack frame,
      which is incorrect for naked floats (arising from live variables, not
      probe arguments) in the wrapper's frame. *)
-  let loc = Stack (Outgoing (offset + (8*i))) in
+  let loc = Stack (Outgoing (offset + n)) in
   (* Manufacture stack entry with this register's type *)
+  (match r.typ with
+   | Int | Val | Addr | Float -> ()
+   | Vec128 ->
+    if not !simd_regalloc_support then
+    Misc.fatal_error "SIMD register allocation is not enabled.");
   Reg.at_location r.typ loc
 
 (* CR mshinwell: Not now, but after code review, it would be better to
@@ -1613,6 +1683,24 @@ let make_stack_loc ~offset i (r : Reg.t) =
    This assumption might no longer hold in the presence of unboxed types.
 *)
 
+let size_of_regs regs =
+  Array.fold_right
+    (fun r acc ->
+      match r.Reg.typ with
+      | Int | Addr | Val -> acc + size_int
+      | Float -> acc + size_float
+      | Vec128 -> acc + size_vec128)
+    regs 0
+
+let stack_locations ~offset regs =
+  let _, locs = Array.fold_right (fun r (n, offsets) ->
+    let next = n + match r.Reg.typ with
+      | Int | Val | Addr -> size_int
+      | Float -> size_float
+      | Vec128 -> size_vec128 in
+    next, (make_stack_loc n r ~offset :: offsets)) regs (0, []) in
+  locs |> List.rev |> Array.of_list
+
 let emit_probe_handler_wrapper p =
   let wrap_label = probe_handler_wrapper_name p.probe_label in
   let probe_name, handler_code_sym =
@@ -1626,7 +1714,7 @@ let emit_probe_handler_wrapper p =
      recall that the wrapper does however have its own frame.) *)
   frame_required := true;
   stack_offset := p.stack_offset;
-  for i = 0 to Proc.num_register_classes - 1 do
+  for i = 0 to Proc.num_stack_slot_classes - 1 do
     num_stack_slots.(i) <- p.num_stack_slots.(i);
   done;
   (* Account for the return address that is now pushed on the stack. *)
@@ -1642,18 +1730,16 @@ let emit_probe_handler_wrapper p =
     I.mov rsp rbp;
   end;
   (* Prepare to call the handler: calculate and allocate stack space.
-     The calculations are simplified using the following property. *)
-  assert (size_addr = 8 && size_int = 8 && size_float = 8);
-  (* Compute the size of stack slots for all live hard registers. *)
+     Compute the size of stack slots for all live hard registers. *)
   let live =
     Reg.Set.elements p.probe_insn.live
     |> List.filter Reg.is_reg
     |> Array.of_list
   in
-  let live_offset = 8 * (Array.length live) in
+  let live_offset = size_of_regs live in
   (* Compute the size of stack slots for spilling all arguments of the probe. *)
   let aux_offset = 8 (* for saving r15 *) in
-  let tmp_offset = 8 * (Array.length p.probe_insn.arg) in
+  let tmp_offset = size_of_regs p.probe_insn.arg in
   let loc_args, loc_offset = Proc.loc_arguments (Reg.typv p.probe_insn.arg) in
   (* Ensure the stack is aligned.
      Assuming that the stack at the probe site is 16-byte aligned,
@@ -1669,7 +1755,7 @@ let emit_probe_handler_wrapper p =
   emit_stack_offset n;
   (* Save all live hard registers *)
   let offset = aux_offset + tmp_offset + loc_offset in
-  let saved_live = Array.mapi (make_stack_loc ~offset) live in
+  let saved_live = stack_locations ~offset live in
   Array.iteri (fun i reg -> move reg saved_live.(i)) live;
   (* Spill r15 to free it to be used as a temporary register for stack to
      stack copying. *)
@@ -1678,11 +1764,11 @@ let emit_probe_handler_wrapper p =
   I.mov r15 (reg saved_r15);
   (* Spill all arguments of the probe.  Some of these may already be on the
      stack, in which case a temporary is used for the move. *)
-  let tmp = Array.mapi (make_stack_loc ~offset:loc_offset) p.probe_insn.arg in
-  Array.iteri (fun i reg -> move_allowing_stack_to_stack reg (tmp.(i)))
+  let saved_args = stack_locations ~offset:loc_offset p.probe_insn.arg in
+  Array.iteri (fun i reg -> move_allowing_stack_to_stack reg (saved_args.(i)))
     p.probe_insn.arg;
   (* Load probe arguments to correct locations for the handler *)
-  Array.iteri (fun i reg -> move_allowing_stack_to_stack tmp.(i) reg) loc_args;
+  Array.iteri (fun i reg -> move_allowing_stack_to_stack saved_args.(i) reg) loc_args;
   (* Reload spilled registers used as temporaries *)
   I.mov (reg saved_r15) r15;
   (* Emit call to handler *)
@@ -1696,7 +1782,7 @@ let emit_probe_handler_wrapper p =
       | Stack (Outgoing k) ->
         (match r.typ with
         | Val -> k::acc
-        | Int | Float -> acc
+        | Int | Float | Vec128 -> acc
         | Addr -> Misc.fatal_error ("bad GC root " ^ Reg.name r))
       | _ -> assert false)
     saved_live
@@ -1764,8 +1850,8 @@ let emit_probe_notes0 () =
     let arg_name =
       match arg.loc with
       | Stack s ->
-        Printf.sprintf "%d(%%rsp)" (slot_offset s (register_class arg))
-      | Reg reg -> Proc.register_name reg
+        Printf.sprintf "%d(%%rsp)" (slot_offset s (stack_slot_class arg.Reg.typ))
+      | Reg reg -> Proc.register_name arg.Reg.typ reg
       | Unknown ->
         Misc.fatal_errorf "Cannot create probe: illegal argument: %a"
           Printmach.reg arg
@@ -1865,7 +1951,8 @@ let end_assembly () =
     | _ -> D.section [".rodata.cst8"] (Some "aM") ["@progbits";"8"]
     end;
     D.align ~data:true 8;
-    List.iter (fun (cst,lbl) -> emit_float_constant cst lbl) !float_constants
+    List.iter (fun (cst,lbl) -> emit_float_constant cst lbl) !float_constants;
+    List.iter (fun (cst,lbl) -> emit_vec128_constant cst lbl) !vec128_constants;
   end;
 
   (* Emit probe handler wrappers *)
diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
index 2bb29f951cb..dc08ca0ce12 100644
--- a/backend/amd64/proc.ml
+++ b/backend/amd64/proc.ml
@@ -48,7 +48,7 @@ let win64 = Arch.win64
     r14         domain state pointer
     r15         allocation pointer
 
-  xmm0 - xmm15  100 - 115  *)
+  xmm0 - xmm15  100 - 115 *)
 
 (* Conventions:
      rax - r13: OCaml function arguments
@@ -103,19 +103,47 @@ let register_class r =
   match r.typ with
   | Val | Int | Addr -> 0
   | Float -> 1
+  | Vec128 ->
+    if not !simd_regalloc_support then
+    Misc.fatal_error "SIMD register allocation is not enabled.";
+    1
 
-let register_class_tag c =
+let num_stack_slot_classes = 3
+
+let stack_slot_class typ =
+  match typ with
+  | Val | Addr | Int -> 0
+  | Float -> 1
+  | Vec128 ->
+    if not !simd_regalloc_support then
+    Misc.fatal_error "SIMD register allocation is not enabled.";
+    2
+
+let stack_class_tag c =
   match c with
   | 0 -> "i"
   | 1 -> "f"
-  | c -> Misc.fatal_errorf "Unspecified register class %d" c
+  | 2 ->
+    if not !simd_regalloc_support then
+    Misc.fatal_error "SIMD register allocation is not enabled.";
+    "x"
+  | c -> Misc.fatal_errorf "Unspecified stack slot class %d" c
 
 let num_available_registers = [| 13; 16 |]
 
 let first_available_register = [| 0; 100 |]
 
-let register_name r =
-  if r < 100 then int_reg_name.(r) else float_reg_name.(r - 100)
+let register_name ty r =
+  (* If the ID doesn't match the type, the array access will raise. *)
+  match ty with
+  | Int | Addr | Val ->
+    int_reg_name.(r - first_available_register.(0))
+  | Float ->
+    float_reg_name.(r - first_available_register.(1))
+  | Vec128 ->
+    if not !simd_regalloc_support then
+    Misc.fatal_error "SIMD register allocation is not enabled.";
+    float_reg_name.(r - first_available_register.(1))
 
 (* Pack registers starting at %rax so as to reduce the number of REX
    prefixes and thus improve code density *)
@@ -133,18 +161,34 @@ let hard_float_reg =
   for i = 0 to 15 do v.(i) <- Reg.at_location Float (Reg (100 + i)) done;
   v
 
-let all_phys_regs =
-  Array.append hard_int_reg hard_float_reg
-
-let phys_reg n =
-  if n < 100 then hard_int_reg.(n) else hard_float_reg.(n - 100)
+let hard_vec128_reg =
+  let v = Array.make 16 Reg.dummy in
+  for i = 0 to 15 do v.(i) <- Reg.at_location Vec128 (Reg (100 + i)) done;
+  fun () -> if !simd_regalloc_support then v
+            else Misc.fatal_error "SIMD register allocation is not enabled."
 
-let rax = phys_reg 0
-let rdx = phys_reg 4
-let r10 = phys_reg 10
-let r11 = phys_reg 11
-let rbp = phys_reg 12
-let rxmm15 = phys_reg 115
+let all_phys_regs =
+  let basic_regs = Array.append hard_int_reg hard_float_reg in
+  fun () -> if !simd_regalloc_support then Array.append basic_regs (hard_vec128_reg ())
+            else basic_regs
+
+let phys_reg ty n =
+  match ty with
+  | Int | Addr | Val -> hard_int_reg.(n)
+  | Float -> hard_float_reg.(n - 100)
+  | Vec128 -> (hard_vec128_reg ()).(n - 100)
+
+let rax = phys_reg Int 0
+let rdx = phys_reg Int 4
+let r10 = phys_reg Int 10
+let r11 = phys_reg Int 11
+let rbp = phys_reg Int 12
+
+(* CSE needs to know that all versions of xmm15 are destroyed. *)
+let destroy_xmm15 () =
+  if !simd_regalloc_support
+  then [| phys_reg Float 115; phys_reg Vec128 115 |]
+  else [| phys_reg Float 115 |]
 
 let destroyed_by_plt_stub =
   if not X86_proc.use_plt then [| |] else [| r10; r11 |]
@@ -154,6 +198,10 @@ let num_destroyed_by_plt_stub = Array.length destroyed_by_plt_stub
 let destroyed_by_plt_stub_set = Reg.set_of_array destroyed_by_plt_stub
 
 let stack_slot slot ty =
+  (match ty with
+   | Float | Int | Addr | Val -> ()
+   | Vec128 -> if not !simd_regalloc_support then
+     Misc.fatal_error "SIMD register allocation is not enabled.");
   Reg.at_location ty (Stack slot)
 
 (* Instruction selection *)
@@ -164,8 +212,7 @@ let word_addressed = false
 
 let size_domainstate_args = 64 * size_int
 
-let calling_conventions first_int last_int first_float last_float
-                        make_stack first_stack
+let calling_conventions first_int last_int first_float last_float make_stack first_stack
                         arg =
   let loc = Array.make (Array.length arg) Reg.dummy in
   let int = ref first_int in
@@ -175,7 +222,7 @@ let calling_conventions first_int last_int first_float last_float
     match arg.(i) with
     | Val | Int | Addr as ty ->
         if !int <= last_int then begin
-          loc.(i) <- phys_reg !int;
+          loc.(i) <- phys_reg ty !int;
           incr int
         end else begin
           loc.(i) <- stack_slot (make_stack !ofs) ty;
@@ -184,13 +231,23 @@ let calling_conventions first_int last_int first_float last_float
         assert (not (Reg.Set.mem loc.(i) destroyed_by_plt_stub_set))
     | Float ->
         if !float <= last_float then begin
-          loc.(i) <- phys_reg !float;
+          loc.(i) <- phys_reg Float !float;
           incr float
         end else begin
           loc.(i) <- stack_slot (make_stack !ofs) Float;
           ofs := !ofs + size_float
         end
+    | Vec128 ->
+      if !float <= last_float then begin
+        loc.(i) <- phys_reg Vec128 !float;
+        incr float
+      end else begin
+        ofs := Misc.align !ofs 16;
+        loc.(i) <- stack_slot (make_stack !ofs) Vec128;
+        ofs := !ofs + size_vec128
+      end
   done;
+  (* CR mslater: (SIMD) will need to be 32/64 if vec256/512 are used. *)
   (loc, Misc.align (max 0 !ofs) 16)  (* keep stack 16-aligned *)
 
 let incoming ofs =
@@ -252,7 +309,7 @@ let win64_loc_external_arguments arg =
     match arg.(i) with
     | Val | Int | Addr as ty ->
         if !reg < 4 then begin
-          loc.(i) <- phys_reg win64_int_external_arguments.(!reg);
+          loc.(i) <- phys_reg ty win64_int_external_arguments.(!reg);
           incr reg
         end else begin
           loc.(i) <- stack_slot (Outgoing !ofs) ty;
@@ -260,13 +317,23 @@ let win64_loc_external_arguments arg =
         end
     | Float ->
         if !reg < 4 then begin
-          loc.(i) <- phys_reg win64_float_external_arguments.(!reg);
+          loc.(i) <- phys_reg Float win64_float_external_arguments.(!reg);
           incr reg
         end else begin
           loc.(i) <- stack_slot (Outgoing !ofs) Float;
           ofs := !ofs + size_float
         end
+    | Vec128 ->
+      if !reg < 4 then begin
+        loc.(i) <- phys_reg Vec128 win64_float_external_arguments.(!reg);
+        incr reg
+      end else begin
+        ofs := Misc.align !ofs 16;
+        loc.(i) <- stack_slot (Outgoing !ofs) Vec128;
+        ofs := !ofs + size_vec128
+      end
   done;
+  (* CR mslater: (SIMD) will need to be 32/64 if vec256/512 are used. *)
   (loc, Misc.align !ofs 16)  (* keep stack 16-aligned *)
 
 let loc_external_arguments ty_args =
@@ -302,18 +369,27 @@ let regs_are_volatile _rs = false
 
 (* Registers destroyed by operations *)
 
+let destroyed_at_c_call_win64 =
+  let basic_regs = Array.append
+    (Array.map (phys_reg Int) [|0;4;5;6;7;10;11|])
+    (Array.sub hard_float_reg 0 6)
+  in
+  fun () -> if !simd_regalloc_support
+  then Array.append basic_regs (Array.sub (hard_vec128_reg ()) 0 6)
+  else basic_regs
+
+let destroyed_at_c_call_unix =
+  (* Unix: rbp, rbx, r12-r15 preserved *)
+  let basic_regs = Array.append
+    (Array.map (phys_reg Int) [|0;2;3;4;5;6;7;10;11|])
+    hard_float_reg
+  in
+  fun () -> if !simd_regalloc_support
+  then Array.append basic_regs (hard_vec128_reg ())
+  else basic_regs
+
 let destroyed_at_c_call =
-  if win64 then
-    (* Win64: rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 preserved *)
-    Array.of_list(List.map phys_reg
-      [0;4;5;6;7;10;11;
-       100;101;102;103;104;105])
-  else
-    (* Unix: rbp, rbx, r12-r15 preserved *)
-    Array.of_list(List.map phys_reg
-      [0;2;3;4;5;6;7;10;11;
-       100;101;102;103;104;105;106;107;
-       108;109;110;111;112;113;114;115])
+  if win64 then destroyed_at_c_call_win64 else destroyed_at_c_call_unix
 
 let destroyed_at_alloc_or_poll =
   if X86_proc.use_plt then
@@ -329,12 +405,13 @@ let has_pushtrap traps =
 
 (* note: keep this function in sync with `destroyed_at_{basic,terminator}` below. *)
 let destroyed_at_oper = function
-    Iop(Icall_ind | Icall_imm _ | Iextcall { alloc = true; }) ->
-    all_phys_regs
-  | Iop(Iextcall { alloc = false; }) -> destroyed_at_c_call
+    Iop(Icall_ind | Icall_imm _ | Iextcall { alloc = true; })
+        -> all_phys_regs ()
+  | Iop(Iextcall { alloc = false; }) -> destroyed_at_c_call ()
   | Iop(Iintop(Idiv | Imod)) | Iop(Iintop_imm((Idiv | Imod), _))
         -> [| rax; rdx |]
-  | Iop(Istore(Single, _, _)) -> [| rxmm15 |]
+  | Iop(Istore(Single, _, _))
+        -> destroy_xmm15 ()
   | Iop(Ialloc _ | Ipoll _) -> destroyed_at_alloc_or_poll
   | Iop(Iintop(Imulh _ | Icomp _) | Iintop_imm((Icomp _), _))
         -> [| rax |]
@@ -359,13 +436,13 @@ let destroyed_at_oper = function
   | Iop(Iintop_atomic _)
   | Iop(Istore((Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
                | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val
-               | Double ), _, _))
+               | Double | Onetwentyeight ), _, _))
   | Iop(Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
        | Icompf _
        | Icsel _
        | Ifloatofint | Iintoffloat
        | Ivalueofint | Iintofvalue
-       | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+       | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
        | Itailcall_ind | Itailcall_imm _ | Istackoffset _ | Iload (_, _, _)
        | Iname_for_debugger _ | Iprobe _| Iprobe_is_enabled _ | Iopaque)
   | Iend | Ireturn _ | Iifthenelse (_, _, _) | Icatch (_, _, _, _)
@@ -378,7 +455,6 @@ let destroyed_at_oper = function
     else
       [||]
 
-
 let destroyed_at_raise = all_phys_regs
 
 let destroyed_at_reloadretaddr = [| |]
@@ -393,7 +469,7 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
   | Op (Intop (Idiv | Imod)) | Op (Intop_imm ((Idiv | Imod), _)) ->
     [| rax; rdx |]
   | Op(Store(Single, _, _)) ->
-    [| rxmm15 |]
+    destroy_xmm15 ()
   | Op(Intop(Imulh _ | Icomp _) | Intop_imm((Icomp _), _)) ->
     [| rax |]
   | Op (Specific (Irdtsc | Irdpmc)) ->
@@ -401,12 +477,12 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
   | Op (Intop Icheckbound | Intop_imm (Icheckbound, _)) ->
     assert false
   | Op (Move | Spill | Reload
-       | Const_int _ | Const_float _ | Const_symbol _
+       | Const_int _ | Const_float _ | Const_symbol _ | Const_vec128 _
        | Stackoffset _
        | Load _ | Store ((Byte_unsigned | Byte_signed | Sixteen_unsigned
                          | Sixteen_signed | Thirtytwo_unsigned
                          | Thirtytwo_signed | Word_int | Word_val
-                         | Double ), _, _)
+                         | Double | Onetwentyeight ), _, _)
        | Intop (Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr
                | Iasr | Ipopcnt | Iclz _ | Ictz _)
        | Intop_imm ((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor
@@ -447,9 +523,9 @@ let destroyed_at_terminator (terminator : Cfg_intf.S.terminator) =
     [| rax; rdx |]
   | Call_no_return { func_symbol = _; alloc; ty_res = _; ty_args = _; }
   | Prim {op = External { func_symbol = _; alloc; ty_res = _; ty_args = _; }; _} ->
-    if alloc then all_phys_regs else destroyed_at_c_call
+    if alloc then all_phys_regs () else destroyed_at_c_call ()
   | Call {op = Indirect | Direct _; _} ->
-    all_phys_regs
+    all_phys_regs ()
   | Specific_can_raise { op = (Ilea _ | Ibswap _ | Isqrtf | Isextend32 | Izextend32
                        | Ifloatarithmem _ | Ifloatsqrtf _
                        | Ifloat_iround | Ifloat_round _ | Ifloat_min | Ifloat_max
@@ -502,7 +578,7 @@ let safe_register_pressure = function
   | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
   | Icompf _
   | Icsel _
-  | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
   | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
   | Istackoffset _ | Iload (_, _, _) | Istore (_, _, _)
   | Iintop _ | Iintop_imm (_, _) | Iintop_atomic _
@@ -536,12 +612,12 @@ let max_register_pressure =
   | Iintop_atomic _
   | Istore((Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
             | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val
-            | Double ),
+            | Double | Onetwentyeight ),
             _, _)
   | Imove | Ispill | Ireload | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
   | Icsel _
   | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
-  | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
   | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
   | Istackoffset _ | Iload (_, _, _)
   | Ispecific(Ilea _ | Isextend32 | Izextend32 | Iprefetch _ | Ipause
@@ -559,7 +635,9 @@ let max_register_pressure =
 
 let frame_required ~fun_contains_calls ~fun_num_stack_slots =
   fp || fun_contains_calls ||
-  fun_num_stack_slots.(0) > 0 || fun_num_stack_slots.(1) > 0
+  fun_num_stack_slots.(0) > 0 ||
+  fun_num_stack_slots.(1) > 0 ||
+  fun_num_stack_slots.(2) > 0
 
 let prologue_required ~fun_contains_calls ~fun_num_stack_slots =
   frame_required ~fun_contains_calls ~fun_num_stack_slots
@@ -575,6 +653,12 @@ let init () =
   end else
     num_available_registers.(0) <- 13
 
+(* Precolored_regs is not always the same as [all_phys_regs], as some physical registers
+   may not be allocatable (e.g. rbp when frame pointers are enabled). *)
+let precolored_regs () =
+  let phys_regs = Reg.set_of_array (all_phys_regs ()) in
+  if fp then Reg.Set.remove rbp phys_regs else phys_regs
+
 let operation_supported = function
   | Cpopcnt -> !popcnt_support
   | Cprefetch _ | Catomic _
diff --git a/backend/amd64/regalloc_stack_operands.ml b/backend/amd64/regalloc_stack_operands.ml
index c3e4b8251a8..ead88266d43 100644
--- a/backend/amd64/regalloc_stack_operands.ml
+++ b/backend/amd64/regalloc_stack_operands.ml
@@ -192,8 +192,8 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
   | Op (Specific (Irdtsc | Irdpmc))
   | Op (Intop (Ipopcnt | Iclz _| Ictz _))
   | Op (Intop_atomic _)
-  | Op (Move | Spill | Reload | Negf | Absf | Const_float _ | Compf _ | Stackoffset _
-       | Load _ | Store _ | Name_for_debugger _ | Probe_is_enabled _
+  | Op (Move | Spill | Reload | Negf | Absf | Const_float _  | Const_vec128 _ | Compf _
+       | Stackoffset _ | Load _ | Store _ | Name_for_debugger _ | Probe_is_enabled _
        | Valueofint | Intofvalue | Opaque | Begin_region | End_region )
   | Op (Specific (Isqrtf | Isextend32 | Izextend32 | Ilea _
                  | Istore_int (_, _, _)
diff --git a/backend/amd64/reload.ml b/backend/amd64/reload.ml
index 37115a3b1c5..35fda48a6f7 100644
--- a/backend/amd64/reload.ml
+++ b/backend/amd64/reload.ml
@@ -178,7 +178,7 @@ method! reload_operation op arg res =
                | Ilfence | Isfence | Imfence
                | Iprefetch _
                | Ibswap _| Ifloatsqrtf _)
-  | Imove|Ispill|Ireload|Inegf|Iabsf|Iconst_float _|Icall_ind|Icall_imm _
+  | Imove|Ispill|Ireload|Inegf|Iabsf|Iconst_float _|Iconst_vec128 _|Icall_ind|Icall_imm _
   | Icompf _
   | Itailcall_ind|Itailcall_imm _|Iextcall _|Istackoffset _|Iload (_, _, _)
   | Istore (_, _, _)|Ialloc _|Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _
diff --git a/backend/amd64/selection.ml b/backend/amd64/selection.ml
index 1cbe721b140..60b15935cc7 100644
--- a/backend/amd64/selection.ml
+++ b/backend/amd64/selection.ml
@@ -87,9 +87,9 @@ let rec select_addr exp =
 
 exception Use_default
 
-let rax = phys_reg 0
-let rcx = phys_reg 5
-let rdx = phys_reg 4
+let rax = phys_reg Int 0
+let rcx = phys_reg Int 5
+let rdx = phys_reg Int 4
 
 let pseudoregs_for_operation op arg res =
   match op with
@@ -170,7 +170,7 @@ let pseudoregs_for_operation op arg res =
               |Ipause|Ilfence|Isfence|Imfence
               |Ioffset_loc (_, _)|Ifloatsqrtf _|Irdtsc|Iprefetch _)
   | Imove|Ispill|Ireload|Ifloatofint|Iintoffloat|Ivalueofint|Iintofvalue
-  | Iconst_int _|Iconst_float _
+  | Iconst_int _|Iconst_float _|Iconst_vec128 _
   | Iconst_symbol _|Icall_ind|Icall_imm _|Itailcall_ind|Itailcall_imm _
   | Iextcall _|Istackoffset _|Iload (_, _, _) | Istore (_, _, _)|Ialloc _
   | Iname_for_debugger _|Iprobe _|Iprobe_is_enabled _ | Iopaque
@@ -263,7 +263,7 @@ method! select_store is_assign addr exp =
       (Ispecific(Istore_int(Nativeint.of_int n, addr, is_assign)), Ctuple [])
   | (Cconst_natint (n, _dbg)) when is_immediate_natint n ->
       (Ispecific(Istore_int(n, addr, is_assign)), Ctuple [])
-  | Cconst_int _
+  | Cconst_int _ | Cconst_vec128 _
   | Cconst_natint (_, _) | Cconst_float (_, _) | Cconst_symbol (_, _)
   | Cvar _ | Clet (_, _, _) | Clet_mut (_, _, _, _) | Cphantom_let (_, _, _)
   | Cassign (_, _) | Ctuple _ | Cop (_, _, _) | Csequence (_, _)
diff --git a/backend/arm64/arch.ml b/backend/arm64/arch.ml
index f61ed7fa36f..a01706c51dd 100644
--- a/backend/arm64/arch.ml
+++ b/backend/arm64/arch.ml
@@ -75,6 +75,8 @@ let size_addr = 8
 let size_int = 8
 let size_float = 8
 
+let size_vec128 = 16
+
 let allow_unaligned_access = false
 
 (* Behavior of division *)
diff --git a/backend/arm64/emit.mlp b/backend/arm64/emit.mlp
index 0a5a259109a..22711e03c1d 100644
--- a/backend/arm64/emit.mlp
+++ b/backend/arm64/emit.mlp
@@ -33,11 +33,11 @@ let fastcode_flag = ref true
 
 (* Names for special regs *)
 
-let reg_domain_state_ptr = phys_reg 25 (* x28 *)
-let reg_trap_ptr = phys_reg 23 (* x26 *)
-let reg_alloc_ptr = phys_reg 24 (* x27 *)
-let reg_tmp1 = phys_reg 26 (* x16 *)
-let reg_x8 = phys_reg 8 (* x8 *)
+let reg_domain_state_ptr = phys_reg Int 25 (* x28 *)
+let reg_trap_ptr = phys_reg Int 23 (* x26 *)
+let reg_alloc_ptr = phys_reg Int 24 (* x27 *)
+let reg_tmp1 = phys_reg Int 26 (* x16 *)
+let reg_x8 = phys_reg Int 8 (* x8 *)
 
 (* Output a label *)
 
@@ -71,7 +71,7 @@ let emit_symbol_size sym =
 (* Output a pseudo-register *)
 
 let emit_reg = function
-    {loc = Reg r} -> emit_string (register_name r)
+    {loc = Reg r; typ} -> emit_string (register_name typ r)
   | _ -> fatal_error "Emit.emit_reg"
 
 (* Likewise, but with the 32-bit name of the register *)
@@ -90,7 +90,7 @@ let emit_wreg = function
 
 let stack_offset = ref 0
 
-let num_stack_slots = Array.make Proc.num_register_classes 0
+let num_stack_slots = Array.make Proc.num_stack_slot_classes 0
 
 let prologue_required = ref false
 
@@ -130,7 +130,7 @@ let emit_stack r =
       let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in
       `[{emit_reg reg_domain_state_ptr}, #{emit_int ofs}]`
   | Stack s ->
-      let ofs = slot_offset s (register_class r) in
+      let ofs = slot_offset s (stack_slot_class r.typ) in
       `[sp, #{emit_int ofs}]`
   | _ -> fatal_error "Emit.emit_stack"
 
@@ -160,7 +160,7 @@ let record_frame_label live dbg =
       | {typ = Val; loc = Reg r} ->
           live_offset := ((r lsl 1) + 1) :: !live_offset
       | {typ = Val; loc = Stack s} as reg ->
-          live_offset := slot_offset s (register_class reg) :: !live_offset
+          live_offset := slot_offset s (stack_slot_class reg.typ) :: !live_offset
       | {typ = Addr} as r ->
           Misc.fatal_error ("bad GC root " ^ Reg.name r)
       | _ -> ())
@@ -495,6 +495,9 @@ module BR = Branch_relaxation.Make (struct
       num_instructions_for_intconst n
     | Lop (Iconst_float _) -> 2
     | Lop (Iconst_symbol _) -> 2
+    | Lop (Iconst_vec128 _) ->
+      (* CR mslater: (SIMD) arm64 *)
+      Misc.fatal_error "128-bit vectors are not supported on this architecture"
     | Lop (Iintop_atomic _) ->
       (* Never generated; builtins are not yet translated to atomics *)
       assert false
@@ -764,6 +767,9 @@ let emit_instr i =
           let lbl = float_literal f in
           emit_load_literal i.res.(0) lbl
         end
+    | Lop(Iconst_vec128 _) ->
+      (* CR mslater: (SIMD) arm64 *)
+      Misc.fatal_error "128-bit vectors are not supported on this architecture"
     | Lop(Iconst_symbol s) ->
         emit_load_symbol_addr i.res.(0) s.sym_name
     | Lop(Icall_ind) ->
@@ -818,6 +824,8 @@ let emit_instr i =
             `	fcvt	{emit_reg dst}, s7\n`
         | Word_int | Word_val | Double ->
             `	ldr	{emit_reg dst}, {emit_addressing addr base}\n`
+        (* CR mslater: (SIMD) arm64 *)
+        | Onetwentyeight -> fatal_error "arm64: got 128 bit memory chunk"
         end
     | Lop(Istore(size, addr, _)) ->
         let src = i.arg.(0) in
@@ -840,6 +848,8 @@ let emit_instr i =
             `	str	s7, {emit_addressing addr base}\n`;
         | Word_int | Word_val | Double ->
             `	str	{emit_reg src}, {emit_addressing addr base}\n`
+        (* CR mslater: (SIMD) arm64 *)
+        | Onetwentyeight -> fatal_error "arm64: got 128 bit memory chunk"
         end
     | Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_heap }) ->
         assembly_code_for_allocation i ~n ~far:false ~dbginfo
@@ -1128,7 +1138,7 @@ let fundecl fundecl =
   stack_offset := 0;
   call_gc_sites := [];
   bound_error_sites := [];
-    for i = 0 to Proc.num_register_classes - 1 do
+  for i = 0 to Proc.num_stack_slot_classes - 1 do
     num_stack_slots.(i) <- fundecl.fun_num_stack_slots.(i);
   done;
   prologue_required := fundecl.fun_prologue_required;
@@ -1180,6 +1190,9 @@ let emit_item = function
   | Cint n -> `	.quad	{emit_nativeint n}\n`
   | Csingle f -> emit_float32_directive ".long" (Int32.bits_of_float f)
   | Cdouble f -> emit_float64_directive ".quad" (Int64.bits_of_float f)
+  | Cvec128 _ ->
+    (* CR mslater: (SIMD) arm64 *)
+    Misc.fatal_error "128-bit vectors not supported on this architecture"
   | Csymbol_address s -> `	.quad	{emit_symbol s.sym_name}\n`
   | Cstring s -> emit_string_directive "	.ascii  " s
   | Cskip n -> if n > 0 then `	.space	{emit_int n}\n`
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index 9deabe5e7ac..7e742cc3955 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -66,12 +66,23 @@ let register_class r =
   match r.typ with
   | Val | Int | Addr  -> 0
   | Float -> 1
+  (* CR mslater: (SIMD) arm64 *)
+  | Vec128 -> fatal_error "arm64: got vec128 register"
 
-let register_class_tag c =
+let num_stack_slot_classes = 2
+
+let stack_slot_class typ =
+  match typ with
+  | Val | Int | Addr  -> 0
+  | Float -> 1
+  (* CR mslater: (SIMD) arm64 *)
+  | Vec128 -> fatal_error "arm64: got vec128 register"
+
+let stack_class_tag c =
   match c with
   | 0 -> "i"
   | 1 -> "f"
-  | c -> Misc.fatal_errorf "Unspecified register class %d" c
+  | c -> Misc.fatal_errorf "Unspecified stack slot class %d" c
 
 let num_available_registers =
   [| 23; 32 |] (* first 23 int regs allocatable; all float regs allocatable *)
@@ -79,8 +90,14 @@ let num_available_registers =
 let first_available_register =
   [| 0; 100 |]
 
-let register_name r =
-  if r < 100 then int_reg_name.(r) else float_reg_name.(r - 100)
+let register_name ty r =
+  match ty with
+  | Val | Int | Addr ->
+    int_reg_name.(r - first_available_register.(0))
+  | Float ->
+    float_reg_name.(r - first_available_register.(1))
+  (* CR mslater: (SIMD) arm64 *)
+  | Vec128 -> fatal_error "arm64: got vec128 register"
 
 let rotate_registers = true
 
@@ -103,11 +120,19 @@ let hard_float_reg =
 let all_phys_regs =
   Array.append hard_int_reg hard_float_reg
 
-let phys_reg n =
-  if n < 100 then hard_int_reg.(n) else hard_float_reg.(n - 100)
+let precolored_regs =
+  let phys_regs = Reg.set_of_array all_phys_regs in
+  fun () -> phys_regs
+
+let phys_reg ty n =
+  match ty with
+  | Int | Addr | Val -> hard_int_reg.(n)
+  | Float -> hard_float_reg.(n - 100)
+  (* CR mslater: (SIMD) arm64 *)
+  | Vec128 -> fatal_error "arm64: got vec128 register"
 
-let reg_x8 = phys_reg 8
-let reg_d7 = phys_reg 107
+let reg_x8 = phys_reg Int 8
+let reg_d7 = phys_reg Float 107
 
 let stack_slot slot ty =
   Reg.at_location ty (Stack slot)
@@ -118,7 +143,7 @@ let size_domainstate_args = 64 * size_int
 
 let loc_int last_int make_stack int ofs =
   if !int <= last_int then begin
-    let l = phys_reg !int in
+    let l = phys_reg Int !int in
     incr int; l
   end else begin
     ofs := Misc.align !ofs size_int;
@@ -128,7 +153,7 @@ let loc_int last_int make_stack int ofs =
 
 let loc_float last_float make_stack float ofs =
   if !float <= last_float then begin
-    let l = phys_reg !float in
+    let l = phys_reg Float !float in
     incr float; l
   end else begin
     ofs := Misc.align !ofs size_float;
@@ -138,7 +163,7 @@ let loc_float last_float make_stack float ofs =
 
 let loc_int32 last_int make_stack int ofs =
   if !int <= last_int then begin
-    let l = phys_reg !int in
+    let l = phys_reg Int !int in
     incr int; l
   end else begin
     let l = stack_slot (make_stack !ofs) Int in
@@ -158,6 +183,8 @@ let calling_conventions
         loc.(i) <- loc_int last_int make_stack int ofs
     | Float ->
         loc.(i) <- loc_float last_float make_stack float ofs
+    (* CR mslater: (SIMD) arm64 *)
+    | Vec128 -> fatal_error "arm64: got vec128 register"
   done;
   (loc, Misc.align (max 0 !ofs) 16)  (* keep stack 16-aligned *)
 
@@ -220,6 +247,8 @@ let external_calling_conventions
         loc.(i) <- [| loc_int32 last_int make_stack int ofs |]
     | XFloat ->
         loc.(i) <- [| loc_float last_float make_stack float ofs |]
+    (* CR mslater: (SIMD) arm64 *)
+    | XVec128 -> fatal_error "arm64: got vec128 register"
     end)
     ty_args;
   (loc, Misc.align !ofs 16)  (* keep stack 16-aligned *)
@@ -230,7 +259,7 @@ let loc_external_arguments ty_args =
 let loc_external_results res =
   let (loc, _) = calling_conventions 0 1 100 100 not_supported 0 res in loc
 
-let loc_exn_bucket = phys_reg 0
+let loc_exn_bucket = phys_reg Int 0
 
 (* See "DWARF for the ARM 64-bit architecture (AArch64)" available from
    developer.arm.com. *)
@@ -265,11 +294,13 @@ let regs_are_volatile _rs = false
 
 let destroyed_at_c_call =
   (* x19-x28, d8-d15 preserved *)
-  Array.of_list (List.map phys_reg
-    [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;
-     100;101;102;103;104;105;106;107;
+  Array.append
+  (Array.of_list (List.map (phys_reg Int)
+    [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15]))
+  (Array.of_list (List.map (phys_reg Float)
+    [100;101;102;103;104;105;106;107;
      116;117;118;119;120;121;122;123;
-     124;125;126;127;128;129;130;131])
+     124;125;126;127;128;129;130;131]))
 
 (* note: keep this function in sync with `destroyed_at_{basic,terminator}` below. *)
 let destroyed_at_oper = function
@@ -284,7 +315,7 @@ let destroyed_at_oper = function
       [| reg_d7 |]            (* d7 / s7 destroyed *)
   | _ -> [||]
 
-let destroyed_at_raise = all_phys_regs
+let destroyed_at_raise () = all_phys_regs
 
 let destroyed_at_reloadretaddr = [| |]
 
diff --git a/backend/arm64/selection.ml b/backend/arm64/selection.ml
index fefbcf75236..2406572cb31 100644
--- a/backend/arm64/selection.ml
+++ b/backend/arm64/selection.ml
@@ -33,7 +33,9 @@ let is_offset chunk n =
     | Thirtytwo_unsigned | Thirtytwo_signed | Single ->
         n land 3 = 0 && n lsr 2 < 0x1000
     | Word_int | Word_val | Double ->
-        n land 7 = 0 && n lsr 3 < 0x1000)
+        n land 7 = 0 && n lsr 3 < 0x1000
+    (* CR mslater: (SIMD) arm64 *)
+    | Onetwentyeight -> Misc.fatal_error "arm64: got 128 bit memory chunk")
 
 let is_logical_immediate n =
   Arch.is_logical_immediate (Nativeint.of_int n)
diff --git a/backend/asm_targets/asm_directives_intf.ml b/backend/asm_targets/asm_directives_intf.ml
index 8a5ec8345c1..46be8c4cb73 100644
--- a/backend/asm_targets/asm_directives_intf.ml
+++ b/backend/asm_targets/asm_directives_intf.ml
@@ -46,6 +46,7 @@ module type Arg = sig
       | NONE
       | DWORD
       | QWORD
+      | VEC128
 
     val file : file_num:int -> file_name:string -> unit
 
diff --git a/backend/cfg/cfg.ml b/backend/cfg/cfg.ml
index 000b4b11b49..2683a18a0f6 100644
--- a/backend/cfg/cfg.ml
+++ b/backend/cfg/cfg.ml
@@ -247,6 +247,8 @@ let dump_op ppf = function
   | Const_int n -> Format.fprintf ppf "const_int %nd" n
   | Const_float f -> Format.fprintf ppf "const_float %F" (Int64.float_of_bits f)
   | Const_symbol s -> Format.fprintf ppf "const_symbol %s" s.sym_name
+  | Const_vec128 { high; low } ->
+    Format.fprintf ppf "const vec128 %016Lx:%016Lx" high low
   | Stackoffset n -> Format.fprintf ppf "stackoffset %d" n
   | Load _ -> Format.fprintf ppf "load"
   | Store _ -> Format.fprintf ppf "store"
@@ -449,6 +451,7 @@ let is_pure_operation : operation -> bool = function
   | Const_int _ -> true
   | Const_float _ -> true
   | Const_symbol _ -> true
+  | Const_vec128 _ -> true
   | Stackoffset _ -> false
   | Load _ -> true
   | Store _ -> false
@@ -498,11 +501,16 @@ let is_pure_basic : basic -> bool = function
 
 let is_noop_move instr =
   match instr.desc with
-  | Op (Move | Spill | Reload) ->
-    (match instr.arg.(0).loc with
+  | Op (Move | Spill | Reload) -> (
+    match instr.arg.(0).loc with
     | Unknown -> false
-    | Reg _ | Stack _ -> Reg.same_loc instr.arg.(0) instr.res.(0))
-    && Proc.register_class instr.arg.(0) = Proc.register_class instr.res.(0)
+    | Reg _ ->
+      Reg.same_loc instr.arg.(0) instr.res.(0)
+      && Proc.register_class instr.arg.(0) = Proc.register_class instr.res.(0)
+    | Stack _ ->
+      Reg.same_loc instr.arg.(0) instr.res.(0)
+      && Proc.stack_slot_class instr.arg.(0).typ
+         = Proc.stack_slot_class instr.res.(0).typ)
   | Op (Csel _) -> (
     match instr.res.(0).loc with
     | Unknown -> false
@@ -512,11 +520,12 @@ let is_noop_move instr =
       let ifnot = instr.arg.(len - 1) in
       Reg.same_loc instr.res.(0) ifso && Reg.same_loc instr.res.(0) ifnot)
   | Op
-      ( Const_int _ | Const_float _ | Const_symbol _ | Stackoffset _ | Load _
-      | Store _ | Intop _ | Intop_imm _ | Intop_atomic _ | Negf | Absf | Addf
-      | Subf | Mulf | Divf | Compf _ | Floatofint | Intoffloat | Opaque
-      | Valueofint | Intofvalue | Probe_is_enabled _ | Specific _
-      | Name_for_debugger _ | Begin_region | End_region )
+      ( Const_int _ | Const_float _ | Const_symbol _ | Const_vec128 _
+      | Stackoffset _ | Load _ | Store _ | Intop _ | Intop_imm _
+      | Intop_atomic _ | Negf | Absf | Addf | Subf | Mulf | Divf | Compf _
+      | Floatofint | Intoffloat | Opaque | Valueofint | Intofvalue
+      | Probe_is_enabled _ | Specific _ | Name_for_debugger _ | Begin_region
+      | End_region )
   | Reloadretaddr | Pushtrap _ | Poptrap | Prologue ->
     false
 
diff --git a/backend/cfg/cfg_intf.ml b/backend/cfg/cfg_intf.ml
index 1e1be5e6aac..02e6d06447b 100644
--- a/backend/cfg/cfg_intf.ml
+++ b/backend/cfg/cfg_intf.ml
@@ -62,6 +62,7 @@ module S = struct
     | Const_int of nativeint (* CR-someday xclerc: change to `Targetint.t` *)
     | Const_float of int64
     | Const_symbol of Cmm.symbol
+    | Const_vec128 of Cmm.vec128_bits
     | Stackoffset of int
     | Load of Cmm.memory_chunk * Arch.addressing_mode * Mach.mutable_flag
     | Store of Cmm.memory_chunk * Arch.addressing_mode * bool
diff --git a/backend/cfg/cfg_to_linear_desc.ml b/backend/cfg/cfg_to_linear_desc.ml
index d034ccf4578..1391f887ff0 100644
--- a/backend/cfg/cfg_to_linear_desc.ml
+++ b/backend/cfg/cfg_to_linear_desc.ml
@@ -15,6 +15,7 @@ let from_basic (basic : basic) : Linear.instruction_desc =
       | Const_int n -> Iconst_int n
       | Const_float n -> Iconst_float n
       | Const_symbol n -> Iconst_symbol n
+      | Const_vec128 bits -> Iconst_vec128 bits
       | Stackoffset n -> Istackoffset n
       | Load (c, m, i) -> Iload (c, m, i)
       | Store (c, m, b) -> Istore (c, m, b)
diff --git a/backend/cfg/cfgize.ml b/backend/cfg/cfgize.ml
index 5bd294c1232..9b54bda1556 100644
--- a/backend/cfg/cfgize.ml
+++ b/backend/cfg/cfgize.ml
@@ -144,6 +144,7 @@ let basic_or_terminator_of_operation :
   | Iconst_int i -> Basic (Op (Const_int i))
   | Iconst_float f -> Basic (Op (Const_float f))
   | Iconst_symbol s -> Basic (Op (Const_symbol s))
+  | Iconst_vec128 bits -> Basic (Op (Const_vec128 bits))
   | Icall_ind ->
     With_next_label (fun label_after -> Call { op = Indirect; label_after })
   | Icall_imm { func } ->
@@ -670,10 +671,11 @@ module Stack_offset_and_exn = struct
     | Op (Stackoffset n) -> stack_offset + n, traps
     | Op
         ( Move | Spill | Reload | Const_int _ | Const_float _ | Const_symbol _
-        | Load _ | Store _ | Intop _ | Intop_imm _ | Intop_atomic _ | Negf
-        | Absf | Addf | Subf | Mulf | Divf | Compf _ | Floatofint | Intoffloat
-        | Valueofint | Csel _ | Intofvalue | Probe_is_enabled _ | Opaque
-        | Begin_region | End_region | Specific _ | Name_for_debugger _ )
+        | Const_vec128 _ | Load _ | Store _ | Intop _ | Intop_imm _
+        | Intop_atomic _ | Negf | Absf | Addf | Subf | Mulf | Divf | Compf _
+        | Floatofint | Intoffloat | Valueofint | Csel _ | Intofvalue
+        | Probe_is_enabled _ | Opaque | Begin_region | End_region | Specific _
+        | Name_for_debugger _ )
     | Reloadretaddr | Prologue ->
       stack_offset, traps
 
diff --git a/backend/cfg/linear_to_cfg.ml b/backend/cfg/linear_to_cfg.ml
index bb6298a109c..ec063de200c 100644
--- a/backend/cfg/linear_to_cfg.ml
+++ b/backend/cfg/linear_to_cfg.ml
@@ -616,6 +616,7 @@ let rec create_blocks (t : t) (i : L.instruction) (block : C.basic_block)
     | Iconst_int n -> basic (Const_int n)
     | Iconst_float n -> basic (Const_float n)
     | Iconst_symbol n -> basic (Const_symbol n)
+    | Iconst_vec128 bits -> basic (Const_vec128 bits)
     | Inegf -> basic Negf
     | Iabsf -> basic Absf
     | Iaddf -> basic Addf
diff --git a/backend/checkmach.ml b/backend/checkmach.ml
index c3a27f70871..dc72562d34e 100644
--- a/backend/checkmach.ml
+++ b/backend/checkmach.ml
@@ -932,8 +932,8 @@ end = struct
   let transform_operation t (op : Mach.operation) ~next ~exn dbg =
     match op with
     | Imove | Ispill | Ireload | Iconst_int _ | Iconst_float _ | Iconst_symbol _
-    | Iload _ | Icompf _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
-    | Ifloatofint | Iintoffloat
+    | Iconst_vec128 _ | Iload _ | Icompf _ | Inegf | Iabsf | Iaddf | Isubf
+    | Imulf | Idivf | Ifloatofint | Iintoffloat
     | Iintop_imm
         ( ( Iadd | Isub | Imul | Imulh _ | Idiv | Imod | Iand | Ior | Ixor
           | Ilsl | Ilsr | Iasr | Ipopcnt | Iclz _ | Ictz _ | Icomp _ ),
diff --git a/backend/cmm.ml b/backend/cmm.ml
index 87034fab329..f1e7376a2d7 100644
--- a/backend/cmm.ml
+++ b/backend/cmm.ml
@@ -18,6 +18,7 @@ type machtype_component = Cmx_format.machtype_component =
   | Addr
   | Int
   | Float
+  | Vec128
 
 type machtype = machtype_component array
 
@@ -26,10 +27,11 @@ let typ_val = [|Val|]
 let typ_addr = [|Addr|]
 let typ_int = [|Int|]
 let typ_float = [|Float|]
+let typ_vec128 = [|Vec128|]
 
 (** [machtype_component]s are partially ordered as follows:
 
-      Addr     Float
+      Addr     Float     Vec128
        ^
        |
       Val
@@ -56,8 +58,11 @@ let lub_component comp1 comp2 =
   | Addr, Addr -> Addr
   | Addr, Val -> Addr
   | Float, Float -> Float
-  | (Int | Addr | Val), Float
-  | Float, (Int | Addr | Val) ->
+  | Vec128, Vec128 -> Vec128
+  | (Int | Addr | Val), (Float | Vec128)
+  | (Float | Vec128), (Int | Addr | Val)
+  | Float, Vec128
+  | Vec128, Float ->
     (* Float unboxing code must be sure to avoid this case. *)
     assert false
 
@@ -73,8 +78,11 @@ let ge_component comp1 comp2 =
   | Addr, Addr -> true
   | Addr, Val -> true
   | Float, Float -> true
-  | (Int | Addr | Val), Float
-  | Float, (Int | Addr | Val) ->
+  | Vec128, Vec128 -> true
+  | (Int | Addr | Val), (Float | Vec128)
+  | (Float | Vec128), (Int | Addr | Val)
+  | Float, Vec128
+  | Vec128, Float ->
     assert false
 
 type exttype =
@@ -82,12 +90,14 @@ type exttype =
   | XInt32
   | XInt64
   | XFloat
+  | XVec128
 
 let machtype_of_exttype = function
   | XInt -> typ_int
   | XInt32 -> typ_int
   | XInt64 -> if Arch.size_int = 4 then [|Int;Int|] else typ_int
   | XFloat -> typ_float
+  | XVec128 -> typ_vec128
 
 let machtype_of_exttype_list xtl =
   Array.concat (List.map machtype_of_exttype xtl)
@@ -182,6 +192,7 @@ type memory_chunk =
   | Word_val
   | Single
   | Double
+  | Onetwentyeight
 
 and operation =
     Capply of machtype * Lambda.region_close
@@ -225,6 +236,7 @@ and operation =
 type kind_for_unboxing =
   | Any
   | Boxed_integer of Lambda.boxed_integer
+  | Boxed_vector of Lambda.boxed_vector
   | Boxed_float
 
 type is_global = Global | Local
@@ -238,12 +250,15 @@ type symbol =
   { sym_name : string;
     sym_global : is_global }
 
+type vec128_bits = { low : int64; high: int64 }
+
 let global_symbol sym_name = { sym_name; sym_global = Global }
 
 type expression =
     Cconst_int of int * Debuginfo.t
   | Cconst_natint of nativeint * Debuginfo.t
   | Cconst_float of float * Debuginfo.t
+  | Cconst_vec128 of vec128_bits * Debuginfo.t
   | Cconst_symbol of symbol * Debuginfo.t
   | Cvar of Backend_var.t
   | Clet of Backend_var.With_provenance.t * expression * expression
@@ -298,6 +313,7 @@ type data_item =
   | Cint of nativeint
   | Csingle of float
   | Cdouble of float
+  | Cvec128 of vec128_bits
   | Csymbol_address of symbol
   | Cstring of string
   | Cskip of int
@@ -342,6 +358,7 @@ let iter_shallow_tail f = function
   | Cconst_int _
   | Cconst_natint _
   | Cconst_float _
+  | Cconst_vec128 _
   | Cconst_symbol _
   | Cvar _
   | Cassign _
@@ -384,6 +401,7 @@ let map_shallow_tail ?kind f = function
   | Cconst_int _
   | Cconst_natint _
   | Cconst_float _
+  | Cconst_vec128 _
   | Cconst_symbol _
   | Cvar _
   | Cassign _
@@ -440,6 +458,7 @@ let iter_shallow f = function
   | Cconst_int _
   | Cconst_natint _
   | Cconst_float _
+  | Cconst_vec128 _
   | Cconst_symbol _
   | Cvar _ ->
       ()
@@ -477,6 +496,7 @@ let map_shallow f = function
   | Cconst_int _
   | Cconst_natint _
   | Cconst_float _
+  | Cconst_vec128 _
   | Cconst_symbol _
   | Cvar _
     as c ->
@@ -488,10 +508,12 @@ let equal_machtype_component left right =
   | Addr, Addr -> true
   | Int, Int -> true
   | Float, Float -> true
-  | Val, (Addr | Int | Float)
-  | Addr, (Val | Int | Float)
-  | Int, (Val | Addr | Float)
-  | Float, (Val | Addr | Int) ->
+  | Vec128, Vec128 -> true
+  | Val, (Addr | Int | Float | Vec128)
+  | Addr, (Val | Int | Float | Vec128)
+  | Int, (Val | Addr | Float | Vec128)
+  | Float, (Val | Addr | Int | Vec128)
+  | Vec128, (Val | Addr | Int | Float) ->
     false
 
 let equal_exttype left right =
@@ -500,10 +522,12 @@ let equal_exttype left right =
   | XInt32, XInt32 -> true
   | XInt64, XInt64 -> true
   | XFloat, XFloat -> true
-  | XInt, (XInt32 | XInt64 | XFloat)
-  | XInt32, (XInt | XInt64 | XFloat)
-  | XInt64, (XInt | XInt32 | XFloat)
-  | XFloat, (XInt | XInt32 | XInt64) ->
+  | XVec128, XVec128 -> true
+  | XInt, (XInt32 | XInt64 | XFloat | XVec128)
+  | XInt32, (XInt | XInt64 | XFloat | XVec128)
+  | XInt64, (XInt | XInt32 | XFloat | XVec128)
+  | XFloat, (XInt | XInt32 | XInt64 | XVec128)
+  | XVec128, (XInt | XInt32 | XInt64 | XFloat) ->
     false
 
 let equal_float_comparison left right =
@@ -542,26 +566,40 @@ let equal_memory_chunk left right =
   | Word_val, Word_val -> true
   | Single, Single -> true
   | Double, Double -> true
+  | Onetwentyeight, Onetwentyeight -> true
   | Byte_unsigned, (Byte_signed | Sixteen_unsigned | Sixteen_signed | Thirtytwo_unsigned
-                   | Thirtytwo_signed | Word_int | Word_val | Single | Double)
+                   | Thirtytwo_signed | Word_int | Word_val | Single | Double
+                   | Onetwentyeight)
   | Byte_signed, (Byte_unsigned | Sixteen_unsigned | Sixteen_signed | Thirtytwo_unsigned
-                 | Thirtytwo_signed | Word_int | Word_val | Single | Double)
+                 | Thirtytwo_signed | Word_int | Word_val | Single | Double
+                 | Onetwentyeight)
   | Sixteen_unsigned, (Byte_unsigned | Byte_signed | Sixteen_signed | Thirtytwo_unsigned
-                      | Thirtytwo_signed | Word_int | Word_val | Single | Double)
+                      | Thirtytwo_signed | Word_int | Word_val | Single | Double
+                      | Onetwentyeight)
   | Sixteen_signed, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Thirtytwo_unsigned
-                    | Thirtytwo_signed | Word_int | Word_val | Single | Double)
+                    | Thirtytwo_signed | Word_int | Word_val | Single | Double
+                    | Onetwentyeight)
   | Thirtytwo_unsigned, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
-                        | Thirtytwo_signed | Word_int | Word_val | Single | Double)
+                        | Thirtytwo_signed | Word_int | Word_val | Single | Double
+                        | Onetwentyeight)
   | Thirtytwo_signed, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
-                      | Thirtytwo_unsigned | Word_int | Word_val | Single | Double)
+                      | Thirtytwo_unsigned | Word_int | Word_val | Single | Double
+                      | Onetwentyeight)
   | Word_int, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
-              | Thirtytwo_unsigned | Thirtytwo_signed | Word_val | Single | Double)
+              | Thirtytwo_unsigned | Thirtytwo_signed | Word_val | Single | Double
+              | Onetwentyeight)
   | Word_val, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
-              | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Single | Double)
+              | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Single | Double
+              | Onetwentyeight)
   | Single, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
-            | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val | Double)
+            | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val | Double
+            | Onetwentyeight)
   | Double, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
-            | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val | Single) ->
+            | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val | Single
+            | Onetwentyeight)
+  | Onetwentyeight, (Byte_unsigned | Byte_signed | Sixteen_unsigned | Sixteen_signed
+            | Thirtytwo_unsigned | Thirtytwo_signed | Word_int | Word_val | Single
+            | Double) ->
     false
 
 let equal_integer_comparison left right =
diff --git a/backend/cmm.mli b/backend/cmm.mli
index 4afc8c87c1f..019c7603587 100644
--- a/backend/cmm.mli
+++ b/backend/cmm.mli
@@ -20,6 +20,7 @@ type machtype_component = Cmx_format.machtype_component =
   | Addr
   | Int
   | Float
+  | Vec128
 
 (* - [Val] denotes a valid OCaml value: either a pointer to the beginning
      of a heap block, an infix pointer if it is preceded by the correct
@@ -54,6 +55,7 @@ val typ_val: machtype
 val typ_addr: machtype
 val typ_int: machtype
 val typ_float: machtype
+val typ_vec128: machtype
 
 (** Least upper bound of two [machtype_component]s. *)
 val lub_component
@@ -73,6 +75,7 @@ type exttype =
   | XInt32                              (**r 32-bit integer *)
   | XInt64                              (**r 64-bit integer  *)
   | XFloat                              (**r double-precision FP number  *)
+  | XVec128                             (**r 128-bit vector *)
 (** A variant of [machtype] used to describe arguments
     to external C functions *)
 
@@ -178,6 +181,9 @@ type memory_chunk =
   | Single
   | Double                             (* word-aligned 64-bit float
                                           see PR#10433 *)
+  | Onetwentyeight                     (* word-aligned 128-bit vector
+                                          CR mslater: (SIMD) alignment *)
+
 and operation =
     Capply of machtype * Lambda.region_close
   | Cextcall of
@@ -229,6 +235,7 @@ and operation =
 type kind_for_unboxing =
   | Any (* This may contain anything, including non-scannable things *)
   | Boxed_integer of Lambda.boxed_integer
+  | Boxed_vector of Lambda.boxed_vector
   | Boxed_float
 
 type is_global = Global | Local
@@ -250,6 +257,10 @@ type symbol =
   { sym_name : string;
     sym_global : is_global }
 
+(* SIMD vectors are untyped in the backend.
+   This record holds the bitwise representation of a 128-bit value. *)
+type vec128_bits = { low : int64; high: int64 }
+
 val global_symbol : string -> symbol
 
 (** Every basic block should have a corresponding [Debuginfo.t] for its
@@ -258,6 +269,7 @@ type expression =
     Cconst_int of int * Debuginfo.t
   | Cconst_natint of nativeint * Debuginfo.t
   | Cconst_float of float * Debuginfo.t
+  | Cconst_vec128 of vec128_bits * Debuginfo.t
   | Cconst_symbol of symbol * Debuginfo.t
   | Cvar of Backend_var.t
   | Clet of Backend_var.With_provenance.t * expression * expression
@@ -315,6 +327,7 @@ type data_item =
   | Cint of nativeint
   | Csingle of float
   | Cdouble of float
+  | Cvec128 of vec128_bits
   | Csymbol_address of symbol
   | Cstring of string
   | Cskip of int
diff --git a/backend/cmm_helpers.ml b/backend/cmm_helpers.ml
index a69028ddefb..090343df451 100644
--- a/backend/cmm_helpers.ml
+++ b/backend/cmm_helpers.ml
@@ -81,6 +81,11 @@ let float_header = block_header Obj.double_tag (size_float / size_addr)
 let float_local_header =
   local_block_header Obj.double_tag (size_float / size_addr)
 
+let boxedvec128_header = block_header Obj.abstract_tag (size_vec128 / size_addr)
+
+let boxedvec128_local_header =
+  local_block_header Obj.abstract_tag (size_vec128 / size_addr)
+
 let floatarray_header len =
   (* Zero-sized float arrays have tag zero for consistency with
      [caml_alloc_float_array]. *)
@@ -144,6 +149,15 @@ let alloc_float_header mode dbg =
   | Lambda.Alloc_heap -> Cconst_natint (float_header, dbg)
   | Lambda.Alloc_local -> Cconst_natint (float_local_header, dbg)
 
+let alloc_boxedvector_header vi mode dbg =
+  let header, local_header =
+    match vi with
+    | Primitive.Pvec128 -> boxedvec128_header, boxedvec128_local_header
+  in
+  match mode with
+  | Lambda.Alloc_heap -> Cconst_natint (header, dbg)
+  | Lambda.Alloc_local -> Cconst_natint (local_header, dbg)
+
 let alloc_floatarray_header len dbg = Cconst_natint (floatarray_header len, dbg)
 
 let alloc_closure_header ~mode sz dbg =
@@ -699,6 +713,41 @@ let rec unbox_float dbg =
     | Ctail e -> Ctail (unbox_float dbg e)
     | cmm -> Cop (Cload (Double, Immutable), [cmm], dbg))
 
+(* Vectors *)
+
+let box_vector dbg vi m c =
+  Cop (Calloc m, [alloc_boxedvector_header vi m dbg; c], dbg)
+
+let rec unbox_vec128 dbg =
+  map_tail ~kind:Any (function
+    | Cop (Calloc _, [Cconst_natint (hdr, _); c], _)
+      when Nativeint.equal hdr boxedvec128_header
+           || Nativeint.equal hdr boxedvec128_local_header ->
+      c
+    | Cconst_symbol (s, _dbg) as cmm -> (
+      match Cmmgen_state.structured_constant_of_sym s.sym_name with
+      | Some (Uconst_vec128 { low; high }) ->
+        Cconst_vec128 ({ low; high }, dbg) (* or keep _dbg? *)
+      | _ -> Cop (Cload (Onetwentyeight, Immutable), [cmm], dbg))
+    | Cregion e as cmm -> (
+      (* It is valid to push unboxing inside a Cregion except when the extra
+         unboxing logic pushes a tail call out of tail position *)
+      match
+        map_tail ~kind:Any
+          (function
+            | Cop (Capply (_, Rc_close_at_apply), _, _) -> raise Exit
+            | Ctail e -> Ctail (unbox_vec128 dbg e)
+            | e -> unbox_vec128 dbg e)
+          e
+      with
+      | e -> Cregion e
+      | exception Exit -> Cop (Cload (Onetwentyeight, Immutable), [cmm], dbg))
+    | Ctail e -> Ctail (unbox_vec128 dbg e)
+    | cmm -> Cop (Cload (Onetwentyeight, Immutable), [cmm], dbg))
+
+let unbox_vector dbg vi e =
+  match vi with Primitive.Pvec128 -> unbox_vec128 dbg e
+
 (* Complex *)
 
 let box_complex dbg c_re c_im =
@@ -1019,6 +1068,7 @@ module Extended_machtype_component = struct
     | Tagged_int
     | Any_int
     | Float
+    | Vec128
 
   let of_machtype_component (component : machtype_component) =
     match component with
@@ -1026,6 +1076,7 @@ module Extended_machtype_component = struct
     | Addr -> Addr
     | Int -> Any_int
     | Float -> Float
+    | Vec128 -> Vec128
 
   let to_machtype_component t : machtype_component =
     match t with
@@ -1033,6 +1084,7 @@ module Extended_machtype_component = struct
     | Addr -> Addr
     | Tagged_int | Any_int -> Int
     | Float -> Float
+    | Vec128 -> Vec128
 
   let change_tagged_int_to_val t : machtype_component =
     match t with
@@ -1041,6 +1093,7 @@ module Extended_machtype_component = struct
     | Tagged_int -> Val
     | Any_int -> Int
     | Float -> Float
+    | Vec128 -> Vec128
 end
 
 module Extended_machtype = struct
@@ -1056,6 +1109,8 @@ module Extended_machtype = struct
 
   let typ_float = [| Extended_machtype_component.Float |]
 
+  let typ_vec128 = [| Extended_machtype_component.Vec128 |]
+
   let typ_void = [||]
 
   let of_machtype machtype =
@@ -1073,6 +1128,7 @@ module Extended_machtype = struct
     | Pbottom ->
       Misc.fatal_error "No unique Extended_machtype for layout [Pbottom]"
     | Punboxed_float -> typ_float
+    | Punboxed_vector Pvec128 -> typ_vec128
     | Punboxed_int _ ->
       (* Only 64-bit architectures, so this is always [typ_int] *)
       typ_any_int
@@ -1093,6 +1149,7 @@ let machtype_identifier t =
     | Val -> 'V'
     | Int -> 'I'
     | Float -> 'F'
+    | Vec128 -> 'X'
     | Addr ->
       Misc.fatal_error "[Addr] is forbidden inside arity for generic functions"
   in
@@ -2738,17 +2795,19 @@ let tuplify_function arity return =
 
 let max_arity_optimized = 15
 
+let ints_per_float = size_float / Arch.size_int
+
+let ints_per_vec128 = size_vec128 / Arch.size_int
+
 let machtype_stored_size t =
-  if Arch.size_int = 4
-  then
-    Array.fold_left
-      (fun cur c ->
-        match c with
-        | Addr -> Misc.fatal_error "[Addr] cannot be stored"
-        | Val | Int -> cur + 1
-        | Float -> cur + 2)
-      0 t
-  else Array.length t
+  Array.fold_left
+    (fun cur c ->
+      match c with
+      | Addr -> Misc.fatal_error "[Addr] cannot be stored"
+      | Val | Int -> cur + 1
+      | Float -> cur + ints_per_float
+      | Vec128 -> cur + ints_per_vec128)
+    0 t
 
 let machtype_non_scanned_size t =
   Array.fold_left
@@ -2757,7 +2816,8 @@ let machtype_non_scanned_size t =
       | Addr -> Misc.fatal_error "[Addr] cannot be stored"
       | Val -> cur
       | Int -> cur + 1
-      | Float -> cur + if Arch.size_int = 4 then 2 else 1)
+      | Float -> cur + ints_per_float
+      | Vec128 -> cur + ints_per_vec128)
     0 t
 
 let make_tuple l = match l with [e] -> e | _ -> Ctuple l
@@ -2766,7 +2826,10 @@ let value_slot_given_machtype vs =
   let non_scanned, scanned =
     List.partition
       (fun (_, c) ->
-        match c with Int | Float -> true | Val -> false | Addr -> assert false)
+        match c with
+        | Int | Float | Vec128 -> true
+        | Val -> false
+        | Addr -> assert false)
       vs
   in
   List.map (fun (v, _) -> Cvar v) (non_scanned @ scanned)
@@ -2782,8 +2845,11 @@ let read_from_closure_given_machtype t clos base_offset dbg =
         | Int ->
           (non_scanned_pos + 1, scanned_pos), load Word_int non_scanned_pos
         | Float ->
-          ( ((non_scanned_pos + if Arch.size_int = 4 then 2 else 1), scanned_pos),
+          ( (non_scanned_pos + ints_per_float, scanned_pos),
             load Double non_scanned_pos )
+        | Vec128 ->
+          ( (non_scanned_pos + ints_per_vec128, scanned_pos),
+            load Onetwentyeight non_scanned_pos )
         | Val -> (non_scanned_pos, scanned_pos + 1), load Word_val scanned_pos
         | Addr -> Misc.fatal_error "[Addr] cannot be read")
       (base_offset, base_offset + machtype_non_scanned_size t)
@@ -3698,6 +3764,9 @@ let emit_nativeint_constant symb n cont =
   emit_block symb boxedintnat_header
     (emit_boxed_nativeint_constant_fields n cont)
 
+let emit_vec128_constant symb bits cont =
+  emit_block symb boxedvec128_header (Cvec128 bits :: cont)
+
 let emit_float_array_constant symb fields cont =
   emit_block symb
     (floatarray_header (List.length fields))
@@ -4056,6 +4125,8 @@ let int32 ~dbg i = natint_const_untagged dbg (Nativeint.of_int32 i)
    cross-compiling for 64-bit on a 32-bit host *)
 let int64 ~dbg i = natint_const_untagged dbg (Int64.to_nativeint i)
 
+let vec128 ~dbg bits = Cconst_vec128 (bits, dbg)
+
 let nativeint ~dbg i = natint_const_untagged dbg i
 
 let letin v ~defining_expr ~body =
@@ -4063,9 +4134,9 @@ let letin v ~defining_expr ~body =
   | Cvar v' when Backend_var.same (Backend_var.With_provenance.var v) v' ->
     defining_expr
   | Cvar _ | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _
-  | Clet _ | Clet_mut _ | Cphantom_let _ | Cassign _ | Ctuple _ | Cop _
-  | Csequence _ | Cifthenelse _ | Cswitch _ | Ccatch _ | Cexit _ | Ctrywith _
-  | Cregion _ | Ctail _ ->
+  | Cconst_vec128 _ | Clet _ | Clet_mut _ | Cphantom_let _ | Cassign _
+  | Ctuple _ | Cop _ | Csequence _ | Cifthenelse _ | Cswitch _ | Ccatch _
+  | Cexit _ | Ctrywith _ | Cregion _ | Ctail _ ->
     Clet (v, defining_expr, body)
 
 let letin_mut v ty e body = Clet_mut (v, ty, e, body)
@@ -4319,6 +4390,8 @@ let cint i = Cmm.Cint i
 
 let cfloat f = Cmm.Cdouble f
 
+let cvec128 bits = Cmm.Cvec128 bits
+
 let symbol_address s = Cmm.Csymbol_address s
 
 let define_symbol symbol = [Cdefine_symbol symbol]
@@ -4354,7 +4427,7 @@ let cmm_arith_size (e : Cmm.expression) =
   in
   match e with
   | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _ | Cvar _
-    ->
+  | Cconst_vec128 _ ->
     Some 0
   | Cop _ -> Some (cmm_arith_size0 e)
   | Clet _ | Clet_mut _ | Cphantom_let _ | Cassign _ | Ctuple _ | Csequence _
@@ -4375,6 +4448,7 @@ let kind_of_layout (layout : Lambda.layout) =
   match layout with
   | Pvalue Pfloatval -> Boxed_float
   | Pvalue (Pboxedintval bi) -> Boxed_integer bi
+  | Pvalue (Pboxedvectorval vi) -> Boxed_vector vi
   | Pvalue (Pgenval | Pintval | Pvariant _ | Parrayval _)
-  | Ptop | Pbottom | Punboxed_float | Punboxed_int _ ->
+  | Ptop | Pbottom | Punboxed_float | Punboxed_int _ | Punboxed_vector _ ->
     Any
diff --git a/backend/cmm_helpers.mli b/backend/cmm_helpers.mli
index 0d6b3816a73..ae606d726ab 100644
--- a/backend/cmm_helpers.mli
+++ b/backend/cmm_helpers.mli
@@ -210,6 +210,17 @@ val box_float : Debuginfo.t -> Lambda.alloc_mode -> expression -> expression
 
 val unbox_float : Debuginfo.t -> expression -> expression
 
+(** Vector boxing and unboxing *)
+val box_vector :
+  Debuginfo.t ->
+  Primitive.boxed_vector ->
+  Lambda.alloc_mode ->
+  expression ->
+  expression
+
+val unbox_vector :
+  Debuginfo.t -> Primitive.boxed_vector -> expression -> expression
+
 (** Complex number creation and access *)
 val box_complex : Debuginfo.t -> expression -> expression -> expression
 
@@ -372,6 +383,7 @@ module Extended_machtype_component : sig
     | Tagged_int
     | Any_int
     | Float
+    | Vec128
 end
 
 module Extended_machtype : sig
@@ -389,6 +401,8 @@ module Extended_machtype : sig
 
   val typ_void : t
 
+  val typ_vec128 : t
+
   (** Conversion from a normal Cmm machtype. *)
   val of_machtype : machtype -> t
 
@@ -942,6 +956,9 @@ val emit_int64_constant : symbol -> int64 -> data_item list -> data_item list
 val emit_nativeint_constant :
   symbol -> nativeint -> data_item list -> data_item list
 
+val emit_vec128_constant :
+  symbol -> Cmm.vec128_bits -> data_item list -> data_item list
+
 val emit_float_array_constant :
   symbol -> float list -> data_item list -> data_item list
 
@@ -988,6 +1005,9 @@ val int32 : dbg:Debuginfo.t -> int32 -> expression
 (** Create a constant int expression from an int64. *)
 val int64 : dbg:Debuginfo.t -> int64 -> expression
 
+(** Create a constant vec128 expression from two int64s. *)
+val vec128 : dbg:Debuginfo.t -> Cmm.vec128_bits -> expression
+
 (** Create a constant int expression from a nativeint. *)
 val nativeint : dbg:Debuginfo.t -> Nativeint.t -> expression
 
@@ -1233,6 +1253,9 @@ val cint : nativeint -> data_item
 (** Static float. *)
 val cfloat : float -> data_item
 
+(** Static 128-bit vector. *)
+val cvec128 : Cmm.vec128_bits -> data_item
+
 (** Static symbol. *)
 val symbol_address : symbol -> data_item
 
diff --git a/backend/cmm_invariants.ml b/backend/cmm_invariants.ml
index 527ca24eb79..4eed664912e 100644
--- a/backend/cmm_invariants.ml
+++ b/backend/cmm_invariants.ml
@@ -127,7 +127,7 @@ end
 
 let rec check env (expr : Cmm.expression) =
   match expr with
-  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _
+  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _ | Cconst_vec128 _
   | Cvar _ ->
     ()
   | Clet (_, expr, body)
diff --git a/backend/cmmgen.ml b/backend/cmmgen.ml
index b7851224d48..648bb0ebb84 100644
--- a/backend/cmmgen.ml
+++ b/backend/cmmgen.ml
@@ -39,6 +39,7 @@ open Cmm_builtins
 type boxed_number =
   | Boxed_float of alloc_mode * Debuginfo.t
   | Boxed_integer of boxed_integer * alloc_mode * Debuginfo.t
+  | Boxed_vector of boxed_vector * alloc_mode * Debuginfo.t
 
 type env = {
   unboxed_ids : (V.t * boxed_number) V.tbl;
@@ -162,6 +163,7 @@ let get_field env layout ptr n dbg =
     | Pvalue Pintval | Punboxed_int _ -> Word_int
     | Pvalue _ -> Word_val
     | Punboxed_float -> Double
+    | Punboxed_vector Pvec128 -> Onetwentyeight
     | Ptop ->
         Misc.fatal_errorf "get_field with Ptop: %a" Debuginfo.print_compact dbg
     | Pbottom ->
@@ -274,6 +276,8 @@ let emit_structured_constant symb cst cont =
       emit_int64_constant symb n cont
   | Uconst_nativeint n ->
       emit_nativeint_constant symb n cont
+  | Uconst_vec128 {high; low} ->
+      emit_vec128_constant symb {high; low} cont
   | Uconst_block (tag, csts) ->
       let cont = List.fold_right emit_constant csts cont in
       emit_block symb (block_header tag (List.length csts)) cont
@@ -319,6 +323,7 @@ let typ_of_boxed_number = function
   | Boxed_float _ -> Cmm.typ_float
   | Boxed_integer (Pint64, _,_) when size_int = 4 -> [|Int;Int|]
   | Boxed_integer _ -> Cmm.typ_int
+  | Boxed_vector (Pvec128, _, _) -> Cmm.typ_vec128
 
 let equal_unboxed_integer ui1 ui2 =
   match ui1, ui2 with
@@ -338,6 +343,7 @@ let box_number bn arg =
   match bn with
   | Boxed_float (m, dbg) -> box_float dbg m arg
   | Boxed_integer (bi, m, dbg) -> box_int dbg bi m arg
+  | Boxed_vector (vi, m, dbg) -> box_vector dbg vi m arg
 
 (* Returns the unboxed representation of a boxed float or integer.
    For Pint32 on 64-bit archs, the high 32 bits of the result are undefined. *)
@@ -349,6 +355,8 @@ let unbox_number dbg bn arg =
     low_32 dbg (unbox_int dbg Pint32 arg)
   | Boxed_integer (bi, _, _) ->
     unbox_int dbg bi arg
+  | Boxed_vector (vi, _, _) ->
+    unbox_vector dbg vi arg
 
 (* Auxiliary functions for optimizing "let" of boxed numbers (floats and
    boxed integers *)
@@ -380,7 +388,7 @@ let join_unboxed_number_kind ~strict k1 k2 =
   | _, _ -> No_unboxing
 
 let is_strict : kind_for_unboxing -> bool = function
-  | Boxed_integer _ | Boxed_float -> false
+  | Boxed_integer _ | Boxed_float | Boxed_vector _ -> false
   | Any -> true
 
 let rec is_unboxed_number_cmm = function
@@ -414,6 +422,8 @@ let rec is_unboxed_number_cmm = function
           Boxed (Boxed_integer (Pint32, alloc_heap, Debuginfo.none), true)
         | Some (Uconst_int64 _) ->
           Boxed (Boxed_integer (Pint64, alloc_heap, Debuginfo.none), true)
+        | Some (Uconst_vec128 _) ->
+          Boxed (Boxed_vector (Pvec128, alloc_heap, Debuginfo.none), true)
         | _ ->
           No_unboxing
       end
@@ -424,6 +434,7 @@ let rec is_unboxed_number_cmm = function
     | Cconst_int _
     | Cconst_natint _
     | Cconst_float _
+    | Cconst_vec128 _
     | Cvar _
     | Cassign _
     | Ctuple _
@@ -911,6 +922,12 @@ and transl_ccall env prim args dbg =
           | Pint32 -> XInt32
           | Pint64 -> XInt64 in
         (xty, transl_unbox_int dbg env bi arg)
+        | Unboxed_vector bi ->
+          let xty =
+            match bi with
+            | Pvec128 -> XVec128
+          in
+          (xty, transl_unbox_vector dbg env bi arg)
     | Untagged_int ->
         (XInt, untag_int (transl env arg) dbg)
   in
@@ -939,6 +956,7 @@ and transl_ccall env prim args dbg =
     | _, Unboxed_integer Pint64 when size_int = 4 ->
         ([|Int; Int|], box_int dbg Pint64 alloc_heap)
     | _, Unboxed_integer bi -> (typ_int, box_int dbg bi alloc_heap)
+    | _, Unboxed_vector Pvec128 -> (typ_vec128, box_vector dbg Pvec128 alloc_heap)
     | _, Untagged_int -> (typ_int, (fun i -> tag_int i dbg))
   in
   let typ_args, args = transl_args prim.prim_native_repr_args args in
@@ -1283,6 +1301,9 @@ and transl_unbox_float dbg env exp =
 and transl_unbox_int dbg env bi exp =
   unbox_int dbg bi (transl env exp)
 
+and transl_unbox_vector dbg env bi exp =
+  unbox_vector dbg bi (transl env exp)
+
 (* transl_unbox_int, but may return garbage in upper bits *)
 and transl_unbox_int_low dbg env bi e =
   let e = transl_unbox_int dbg env bi e in
@@ -1346,7 +1367,7 @@ and transl_let env str (layout : Lambda.layout) id exp transl_body =
        there may be constant closures inside that need lifting out. *)
     let _cbody : expression = transl_body env in
     cexp
-  | Punboxed_float | Punboxed_int _ -> begin
+  | Punboxed_float | Punboxed_int _ | Punboxed_vector _ -> begin
       let cexp = transl env exp in
       let cbody = transl_body env in
       match str with
diff --git a/backend/coloring.ml b/backend/coloring.ml
index 9325289d443..46f55697efb 100644
--- a/backend/coloring.ml
+++ b/backend/coloring.ml
@@ -44,27 +44,28 @@ let allocate_registers() =
   let unconstrained = ref [] in
 
   (* Reset the stack slot counts *)
-  let num_stack_slots = Array.make Proc.num_register_classes 0 in
+  let num_stack_slots = Array.make Proc.num_stack_slot_classes 0 in
 
   (* Preallocate the spilled registers in the stack.
      Split the remaining registers into constrained and unconstrained. *)
   let remove_reg reg =
     let cl = Proc.register_class reg in
+    let stack_cl = Proc.stack_slot_class reg.typ in
     if reg.spill then begin
       (* Preallocate the registers in the stack *)
-      let nslots = num_stack_slots.(cl) in
+      let nslots = num_stack_slots.(stack_cl) in
       let conflict = Array.make nslots false in
       List.iter
         (fun r ->
           match r.loc with
             Stack(Local n) ->
-              if Proc.register_class r = cl then conflict.(n) <- true
+              if Proc.stack_slot_class r.typ = stack_cl then conflict.(n) <- true
           | _ -> ())
         reg.interf;
       let slot = ref 0 in
       while !slot < nslots && conflict.(!slot) do incr slot done;
       reg.loc <- Stack(Local !slot);
-      if !slot >= nslots then num_stack_slots.(cl) <- !slot + 1
+      if !slot >= nslots then num_stack_slots.(stack_cl) <- !slot + 1
     end else if reg.degree < Proc.num_available_registers.(cl) then
       unconstrained := reg :: !unconstrained
     else begin
@@ -90,6 +91,7 @@ let allocate_registers() =
   (* Assign a location to a register, the best we can. *)
   let assign_location reg =
     let cl = Proc.register_class reg in
+    let stack_cl = Proc.stack_slot_class reg.typ in
     let first_reg = Proc.first_available_register.(cl) in
     let num_regs = Proc.num_available_registers.(cl) in
     let score = Array.make num_regs 0 in
@@ -161,7 +163,7 @@ let allocate_registers() =
                                 if start >= num_regs then 0 else start)
     end else begin
       (* Sorry, we must put the pseudoreg in a stack location *)
-      let nslots = num_stack_slots.(cl) in
+      let nslots = num_stack_slots.(stack_cl) in
       let score = Array.make nslots 0 in
       (* Compute the scores as for registers *)
       List.iter
@@ -207,7 +209,7 @@ let allocate_registers() =
       else begin
         (* Allocate a new stack slot *)
         reg.loc <- Stack(Local nslots);
-        num_stack_slots.(cl) <- nslots + 1
+        num_stack_slots.(stack_cl) <- nslots + 1
       end
     end;
     (* Cancel the preferences of this register so that they don't influence
diff --git a/backend/comballoc.ml b/backend/comballoc.ml
index 5275f0a9490..d67b8652879 100644
--- a/backend/comballoc.ml
+++ b/backend/comballoc.ml
@@ -87,9 +87,8 @@ let rec combine i allocstate =
     end
   | Iop((Imove|Ispill|Ireload|Inegf|Iabsf|Iaddf|Isubf|Imulf|Idivf|Ifloatofint|
          Iintoffloat|Ivalueofint|Iintofvalue|Iopaque|Iconst_int _|Iconst_float _|
-         Iconst_symbol _|Istackoffset _|Iload (_, _, _)|Istore (_, _, _)|Icompf _|
-         Icsel _ |
-         Ispecific _|Iname_for_debugger _|Iprobe_is_enabled _))
+         Iconst_vec128 _|Iconst_symbol _|Istackoffset _|Iload (_, _, _)|Istore (_, _, _)|
+         Icompf _|Icsel _ |Ispecific _|Iname_for_debugger _|Iprobe_is_enabled _))
   | Iop(Iintop(Iadd | Isub | Imul | Idiv | Imod | Iand | Ior | Ixor
               | Ilsl | Ilsr | Iasr | Ipopcnt | Imulh _
               | Iclz _ | Ictz _ | Icomp _))
diff --git a/backend/debug/available_regs.ml b/backend/debug/available_regs.ml
index b792fa48c7a..7ea2d378849 100644
--- a/backend/debug/available_regs.ml
+++ b/backend/debug/available_regs.ml
@@ -184,6 +184,7 @@ let rec available_regs (instr : M.instruction) ~(avail_before : RAS.t) : RAS.t =
           else
             RD.Set.made_unavailable_by_clobber avail_before
               ~regs_clobbered:instr.res ~register_class:Proc.register_class
+              ~stack_class:(fun r -> Proc.stack_slot_class r.typ)
         in
         let results =
           Array.map2
@@ -209,7 +210,8 @@ let rec available_regs (instr : M.instruction) ~(avail_before : RAS.t) : RAS.t =
             Array.append (Proc.destroyed_at_oper instr.desc) instr.res
           in
           RD.Set.made_unavailable_by_clobber avail_before ~regs_clobbered
-            ~register_class:Proc.register_class
+            ~register_class:Proc.register_class ~stack_class:(fun r ->
+              Proc.stack_slot_class r.typ)
         in
         (* Second: the cases of (a) allocations and (b) OCaml to OCaml function
            calls. In these cases, since the GC may run, registers always become
diff --git a/backend/debug/reg_with_debug_info.ml b/backend/debug/reg_with_debug_info.ml
index 9d36d9be824..6da74b6f615 100644
--- a/backend/debug/reg_with_debug_info.ml
+++ b/backend/debug/reg_with_debug_info.ml
@@ -99,23 +99,29 @@ let reg t = t.reg
 let location t = t.reg.loc
 
 let holds_pointer t =
-  match t.reg.typ with Addr | Val -> true | Int | Float -> false
+  match t.reg.typ with Addr | Val -> true | Int | Float | Vec128 -> false
 
 let holds_non_pointer t = not (holds_pointer t)
 
 let assigned_to_stack t =
   match t.reg.loc with Stack _ -> true | Reg _ | Unknown -> false
 
-let regs_at_same_location (reg1 : Reg.t) (reg2 : Reg.t) ~register_class =
+let regs_at_same_location (reg1 : Reg.t) (reg2 : Reg.t) ~register_class
+    ~stack_class =
   (* We need to check the register classes too: two locations both saying "stack
      offset N" might actually be different physical locations, for example if
      one is of class "Int" and another "Float" on amd64. [register_class] will
      be [Proc.register_class], but cannot be here, due to a circular
      dependency. *)
-  reg1.loc = reg2.loc && register_class reg1 = register_class reg2
+  reg1.loc = reg2.loc
+  &&
+  match reg1.loc with
+  | Reg _ -> register_class reg1 = register_class reg2
+  | Stack _ -> stack_class reg1 = stack_class reg2
+  | Unknown -> Misc.fatal_errorf "regs_at_same_location got Unknown locations"
 
-let at_same_location t (reg : Reg.t) ~register_class =
-  regs_at_same_location t.reg reg ~register_class
+let at_same_location t (reg : Reg.t) ~register_class ~stack_class =
+  regs_at_same_location t.reg reg ~register_class ~stack_class
 
 let debug_info t = t.debug_info
 
@@ -152,12 +158,14 @@ module Set = struct
       (fun reg acc -> add (create_without_debug_info ~reg) acc)
       regs empty
 
-  let made_unavailable_by_clobber t ~regs_clobbered ~register_class =
+  let made_unavailable_by_clobber t ~regs_clobbered ~register_class ~stack_class
+      =
     Reg.Set.fold
       (fun reg acc ->
         let made_unavailable =
           filter
-            (fun reg' -> regs_at_same_location reg'.reg reg ~register_class)
+            (fun reg' ->
+              regs_at_same_location reg'.reg reg ~register_class ~stack_class)
             t
         in
         union made_unavailable acc)
diff --git a/backend/debug/reg_with_debug_info.mli b/backend/debug/reg_with_debug_info.mli
index a26d1e12359..56619dd597f 100644
--- a/backend/debug/reg_with_debug_info.mli
+++ b/backend/debug/reg_with_debug_info.mli
@@ -63,7 +63,12 @@ val debug_info : t -> Debug_info.t option
     (physical or pseudoregister) location as the register [reg], which is not
     equipped with debugging information. [register_class] should be
     [Proc.register_class]. *)
-val at_same_location : t -> Reg.t -> register_class:(Reg.t -> int) -> bool
+val at_same_location :
+  t ->
+  Reg.t ->
+  register_class:(Reg.t -> int) ->
+  stack_class:(Reg.t -> int) ->
+  bool
 
 val holds_pointer : t -> bool
 
@@ -98,7 +103,11 @@ module Set : sig
       registers in [regs_clobbered]. (Think of [t] as a set of available
       registers.) [register_class] should always be [Proc.register_class]. *)
   val made_unavailable_by_clobber :
-    t -> regs_clobbered:Reg.t array -> register_class:(Reg.t -> int) -> t
+    t ->
+    regs_clobbered:Reg.t array ->
+    register_class:(Reg.t -> int) ->
+    stack_class:(Reg.t -> int) ->
+    t
 end
 
 val print :
diff --git a/backend/interf.ml b/backend/interf.ml
index c34eaa0a867..6436e344bf1 100644
--- a/backend/interf.ml
+++ b/backend/interf.ml
@@ -114,7 +114,7 @@ let build_graph fundecl =
     | Iexit _ ->
         ()
     | Itrywith(body, _kind, (_ts, handler)) ->
-        add_interf_set Proc.destroyed_at_raise handler.live;
+        add_interf_set (Proc.destroyed_at_raise ()) handler.live;
         interf body; interf handler; interf i.next
     | Iraise _ -> () in
 
diff --git a/backend/interval.ml b/backend/interval.ml
index 1546e39effa..4955ba7b812 100644
--- a/backend/interval.ml
+++ b/backend/interval.ml
@@ -108,7 +108,7 @@ let insert_destroyed_at_oper intervals instr pos =
     update_interval_position_by_array intervals destroyed pos Result
 
 let insert_destroyed_at_raise intervals pos =
-  let destroyed = Proc.destroyed_at_raise in
+  let destroyed = Proc.destroyed_at_raise () in
   if Array.length destroyed > 0 then
     update_interval_position_by_array intervals destroyed pos Result
 
diff --git a/backend/linscan.ml b/backend/linscan.ml
index 21416be23d7..802779f4298 100644
--- a/backend/linscan.ml
+++ b/backend/linscan.ml
@@ -72,7 +72,7 @@ let rec release_expired_inactive ci pos = function
 (* Allocate a new stack slot to the interval. *)
 
 let allocate_stack_slot num_stack_slots i =
-  let cl = Proc.register_class i.reg in
+  let cl = Proc.stack_slot_class i.reg.typ in
   let ss = num_stack_slots.(cl) in
   num_stack_slots.(cl) <- succ ss;
   i.reg.loc <- Stack(Local ss);
@@ -189,7 +189,7 @@ let allocate_registers() =
     };
   done;
   (* Reset the stack slot counts *)
-  let num_stack_slots = Array.make Proc.num_register_classes 0 in
+  let num_stack_slots = Array.make Proc.num_stack_slot_classes 0 in
   (* Add all fixed intervals (sorted by end position) *)
   List.iter
     (fun i ->
diff --git a/backend/mach.ml b/backend/mach.ml
index 9b62441e9aa..fe5a5dce0d6 100644
--- a/backend/mach.ml
+++ b/backend/mach.ml
@@ -52,6 +52,7 @@ type operation =
   | Ireload
   | Iconst_int of nativeint
   | Iconst_float of int64
+  | Iconst_vec128 of Cmm.vec128_bits
   | Iconst_symbol of Cmm.symbol
   | Icall_ind
   | Icall_imm of { func : Cmm.symbol; }
@@ -175,7 +176,7 @@ let rec instr_iter f i =
           instr_iter f body; instr_iter f handler; instr_iter f i.next
       | Iraise _ -> ()
       | Iop (Imove | Ispill | Ireload
-            | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+            | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
             | Icall_ind | Icall_imm _ | Iextcall _ | Istackoffset _
             | Iload _ | Istore _ | Ialloc _
             | Iintop _ | Iintop_imm _ | Iintop_atomic _
@@ -206,7 +207,7 @@ let operation_is_pure = function
   | Icompf _
   | Icsel _
   | Ifloatofint | Iintoffloat
-  | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
   | Iload (_, _, _) | Iname_for_debugger _
     -> true
 
@@ -226,7 +227,7 @@ let operation_can_raise op =
   | Icompf _
   | Icsel _
   | Ifloatofint | Iintoffloat | Ivalueofint | Iintofvalue
-  | Iconst_int _ | Iconst_float _ | Iconst_symbol _
+  | Iconst_int _ | Iconst_float _ | Iconst_symbol _ | Iconst_vec128 _
   | Istackoffset _ | Istore _  | Iload (_, _, _) | Iname_for_debugger _
   | Itailcall_imm _ | Itailcall_ind
   | Iopaque | Ibeginregion | Iendregion
diff --git a/backend/mach.mli b/backend/mach.mli
index ba68d8c0373..83b5c81315b 100644
--- a/backend/mach.mli
+++ b/backend/mach.mli
@@ -55,6 +55,7 @@ type operation =
   | Ireload
   | Iconst_int of nativeint
   | Iconst_float of int64
+  | Iconst_vec128 of Cmm.vec128_bits
   | Iconst_symbol of Cmm.symbol
   | Icall_ind
   | Icall_imm of { func : Cmm.symbol; }
diff --git a/backend/polling.ml b/backend/polling.ml
index c496d4bef5a..59983584e1e 100644
--- a/backend/polling.ml
+++ b/backend/polling.ml
@@ -241,7 +241,7 @@ let find_poll_alloc_or_calls instr =
       | Iop(Icall_ind | Icall_imm _ |
             Itailcall_ind | Itailcall_imm _ ) -> Some (Function_call, i.dbg)
       | Iop(Iextcall { alloc = true }) -> Some (External_call, i.dbg)
-      | Iop(Imove | Ispill | Ireload | Iconst_int _ | Iconst_float _ |
+      | Iop(Imove | Ispill | Ireload | Iconst_int _ | Iconst_float _ | Iconst_vec128 _ |
             Iconst_symbol _ | Iextcall { alloc = false } | Istackoffset _ |
             Iload _ | Istore _ | Iintop _ | Iintop_imm _ | Iintop_atomic _ |
             Ifloatofint | Iintoffloat | Inegf | Iabsf | Iaddf | Isubf |
diff --git a/backend/printcmm.ml b/backend/printcmm.ml
index 8b4eae9bdc8..85b2f7ff65b 100644
--- a/backend/printcmm.ml
+++ b/backend/printcmm.ml
@@ -34,6 +34,7 @@ let machtype_component ppf = function
   | Addr -> fprintf ppf "addr"
   | Int -> fprintf ppf "int"
   | Float -> fprintf ppf "float"
+  | Vec128 -> fprintf ppf "vec128"
 
 let machtype ppf mty =
   match Array.length mty with
@@ -48,6 +49,7 @@ let exttype ppf = function
   | XInt32 -> fprintf ppf "int32"
   | XInt64 -> fprintf ppf "int64"
   | XFloat -> fprintf ppf "float"
+  | XVec128 -> fprintf ppf "vec128"
 
 let extcall_signature ppf (ty_res, ty_args) =
   begin match ty_args with
@@ -97,6 +99,7 @@ let chunk = function
   | Sixteen_signed -> "signed int16"
   | Thirtytwo_unsigned -> "unsigned int32"
   | Thirtytwo_signed -> "signed int32"
+  | Onetwentyeight -> "vec128"
   | Word_int -> "int"
   | Word_val -> "val"
   | Single -> "float32"
@@ -244,6 +247,7 @@ let rec expr ppf = function
   | Cconst_int (n, _dbg) -> fprintf ppf "%i" n
   | Cconst_natint (n, _dbg) ->
     fprintf ppf "%s" (Nativeint.to_string n)
+  | Cconst_vec128 ({low; high}, _dbg) -> fprintf ppf "%016Lx:%016Lx" high low
   | Cconst_float (n, _dbg) -> fprintf ppf "%F" n
   | Cconst_symbol (s, _dbg) -> fprintf ppf "%a:\"%s\"" is_global s.sym_global s.sym_name
   | Cvar id -> V.print ppf id
@@ -411,6 +415,8 @@ let data_item ppf = function
   | Cint n -> fprintf ppf "int %s" (Nativeint.to_string n)
   | Csingle f -> fprintf ppf "single %F" f
   | Cdouble f -> fprintf ppf "double %F" f
+  | Cvec128 {high; low} ->
+    fprintf ppf "vec128 %s:%s" (Int64.to_string high) (Int64.to_string low)
   | Csymbol_address s -> fprintf ppf "addr %a:\"%s\"" is_global s.sym_global s.sym_name
   | Cstring s -> fprintf ppf "string \"%s\"" s
   | Cskip n -> fprintf ppf "skip %i" n
diff --git a/backend/printmach.ml b/backend/printmach.ml
index 7ffc5457a16..e7306848c7a 100644
--- a/backend/printmach.ml
+++ b/backend/printmach.ml
@@ -23,14 +23,14 @@ open Interval
 
 module V = Backend_var
 
-let loc ?(wrap_out = fun ppf f -> f ppf) ~reg_class ~unknown ppf l =
-  match l with
+let loc ?(wrap_out = fun ppf f -> f ppf) ~unknown ppf loc typ =
+  match loc with
   | Unknown -> unknown ppf
   | Reg r ->
-      wrap_out ppf (fun ppf -> fprintf ppf "%s" (Proc.register_name r))
+      wrap_out ppf (fun ppf -> fprintf ppf "%s" (Proc.register_name typ r))
   | Stack(Local s) ->
       wrap_out ppf (fun ppf ->
-        fprintf ppf "s[%s:%i]" (Proc.register_class_tag reg_class) s)
+        fprintf ppf "s[%s:%i]" (Proc.stack_class_tag (Proc.stack_slot_class typ)) s)
   | Stack(Incoming s) ->
       wrap_out ppf (fun ppf -> fprintf ppf "par[%i]" s)
   | Stack(Outgoing s) ->
@@ -43,11 +43,16 @@ let reg ppf r =
     fprintf ppf "%s" (Reg.name r)
   else
     fprintf ppf "%s"
-      (match r.typ with Val -> "V" | Addr -> "A" | Int -> "I" | Float -> "F");
+      (match r.typ with
+      | Val -> "V"
+      | Addr -> "A"
+      | Int -> "I"
+      | Float -> "F"
+      | Vec128 -> "X");
   fprintf ppf "/%i" r.stamp;
   loc
     ~wrap_out:(fun ppf f -> fprintf ppf "[%t]" f)
-    ~reg_class:(Proc.register_class r) ~unknown:(fun _ -> ()) ppf r.loc
+    ~unknown:(fun _ -> ()) ppf r.loc r.typ
 
 let regs' ?(print_reg = reg) ppf v =
   let reg = print_reg in
@@ -159,6 +164,7 @@ let operation' ?(print_reg = reg) op arg ppf res =
   | Iconst_int n -> fprintf ppf "%s" (Nativeint.to_string n)
   | Iconst_float f -> fprintf ppf "%F" (Int64.float_of_bits f)
   | Iconst_symbol s -> fprintf ppf "\"%s\"" s.sym_name
+  | Iconst_vec128 {high; low} -> fprintf ppf "%016Lx:%016Lx" high low
   | Icall_ind -> fprintf ppf "call %a" regs arg
   | Icall_imm { func; } -> fprintf ppf "call \"%s\" %a" func.sym_name regs arg
   | Itailcall_ind -> fprintf ppf "tailcall %a" regs arg
diff --git a/backend/printmach.mli b/backend/printmach.mli
index 1f0672230b2..c4086611157 100644
--- a/backend/printmach.mli
+++ b/backend/printmach.mli
@@ -17,7 +17,7 @@
 
 open Format
 
-val loc: ?wrap_out:(formatter -> (formatter -> unit) -> unit) -> reg_class:int -> unknown:(formatter -> unit) -> formatter -> Reg.location -> unit
+val loc: ?wrap_out:(formatter -> (formatter -> unit) -> unit) -> unknown:(formatter -> unit) -> formatter -> Reg.location -> Cmm.machtype_component -> unit
 val reg: formatter -> Reg.t -> unit
 val regs': ?print_reg:(formatter -> Reg.t -> unit) -> formatter -> Reg.t array -> unit
 val regs: formatter -> Reg.t array -> unit
diff --git a/backend/proc.mli b/backend/proc.mli
index ba6edc08d1a..f541a790cc4 100644
--- a/backend/proc.mli
+++ b/backend/proc.mli
@@ -21,13 +21,19 @@ val word_addressed: bool
 (* Registers available for register allocation *)
 val num_register_classes: int
 val register_class: Reg.t -> int
-val register_class_tag: int -> string
 val num_available_registers: int array
 val first_available_register: int array
-val register_name: int -> string
-val phys_reg: int -> Reg.t
+val register_name: Cmm.machtype_component -> int -> string
+val phys_reg: Cmm.machtype_component -> int -> Reg.t
 val rotate_registers: bool
-val all_phys_regs : Reg.t array
+val precolored_regs : unit -> Reg.Set.t
+
+(* The number of stack slot classes may differ from the number of register classes.
+   On x86, we use the same class for floating point and SIMD vector registers,
+   but they take up different amounts of space on the stack. *)
+val num_stack_slot_classes: int
+val stack_slot_class: Cmm.machtype_component -> int
+val stack_class_tag: int -> string
 
 (* Calling conventions *)
 val loc_arguments: Cmm.machtype -> Reg.t array * int
@@ -55,7 +61,7 @@ val max_register_pressure: Mach.operation -> int array
 
 (* Registers destroyed by operations *)
 val destroyed_at_oper: Mach.instruction_desc -> Reg.t array
-val destroyed_at_raise: Reg.t array
+val destroyed_at_raise: unit -> Reg.t array
 val destroyed_at_reloadretaddr : Reg.t array
 val destroyed_at_pushtrap : Reg.t array
 val destroyed_at_basic : Cfg_intf.S.basic -> Reg.t array
diff --git a/backend/reg.ml b/backend/reg.ml
index 1b302a9e765..6058f3e9a36 100644
--- a/backend/reg.ml
+++ b/backend/reg.ml
@@ -195,6 +195,7 @@ let is_reg t =
 
 let size_of_contents_in_bytes t =
   match t.typ with
+  | Vec128 -> Arch.size_vec128
   | Float -> Arch.size_float
   | Addr ->
     assert (Arch.size_addr = Arch.size_int);
diff --git a/backend/regalloc/regalloc_invariants.ml b/backend/regalloc/regalloc_invariants.ml
index fc2daefdd99..7ea23b64233 100644
--- a/backend/regalloc/regalloc_invariants.ml
+++ b/backend/regalloc/regalloc_invariants.ml
@@ -17,6 +17,7 @@ let precondition : Cfg_with_layout.t -> unit =
       | Const_int _ -> ()
       | Const_float _ -> ()
       | Const_symbol _ -> ()
+      | Const_vec128 _ -> ()
       | Stackoffset _ -> ()
       | Load _ -> ()
       | Store _ -> ()
@@ -100,9 +101,9 @@ let precondition : Cfg_with_layout.t -> unit =
   let fun_num_stack_slots =
     (Cfg_with_layout.cfg cfg_with_layout).fun_num_stack_slots
   in
-  Array.iteri fun_num_stack_slots ~f:(fun reg_class num_slots ->
+  Array.iteri fun_num_stack_slots ~f:(fun stack_class num_slots ->
       if num_slots <> 0
-      then fatal "register class %d has %d slots(s)" reg_class num_slots)
+      then fatal "stack slot class %d has %d slots(s)" stack_class num_slots)
 
 let postcondition_layout : Cfg_with_layout.t -> unit =
  fun cfg_with_layout ->
@@ -145,13 +146,15 @@ let postcondition_layout : Cfg_with_layout.t -> unit =
   let register_classes_must_be_consistent (id : Instruction.id) (reg : Reg.t) :
       unit =
     match reg.Reg.loc with
-    | Reg phys_reg ->
-      let phys_reg = Proc.phys_reg phys_reg in
-      if not (same_reg_class reg phys_reg)
-      then
+    | Reg phys_reg -> (
+      try
+        let (_ : Reg.t) = Proc.phys_reg reg.typ phys_reg in
+        ()
+      with Invalid_argument _ ->
         fatal
-          "instruction %d assigned %a to %a but they are in different classes"
-          id Printmach.reg reg Printmach.reg phys_reg
+          "instruction %d assigned %a to register %i, which has an \
+           incompatible class"
+          id Printmach.reg reg phys_reg)
     | Stack _ | Unknown -> ()
   in
   let register_classes_must_be_consistent (id : Instruction.id)
@@ -160,7 +163,7 @@ let postcondition_layout : Cfg_with_layout.t -> unit =
   in
   let module Int = Numbers.Int in
   let used_stack_slots =
-    Array.init Proc.num_register_classes ~f:(fun _ -> Int.Set.empty)
+    Array.init Proc.num_stack_slot_classes ~f:(fun _ -> Int.Set.empty)
   in
   let record_stack_slot_use (reg : Reg.t) : unit =
     match reg.loc with
@@ -169,9 +172,9 @@ let postcondition_layout : Cfg_with_layout.t -> unit =
     | Stack stack_loc -> (
       match stack_loc with
       | Local index ->
-        let reg_class = Proc.register_class reg in
-        used_stack_slots.(reg_class)
-          <- Int.Set.add index used_stack_slots.(reg_class)
+        let stack_class = Proc.stack_slot_class reg.typ in
+        used_stack_slots.(stack_class)
+          <- Int.Set.add index used_stack_slots.(stack_class)
       | Incoming _ -> ()
       | Outgoing _ -> ()
       | Domainstate _ -> ())
@@ -203,7 +206,7 @@ let postcondition_layout : Cfg_with_layout.t -> unit =
   let fun_num_stack_slots =
     (Cfg_with_layout.cfg cfg_with_layout).fun_num_stack_slots
   in
-  Array.iteri fun_num_stack_slots ~f:(fun reg_class num_slots ->
+  Array.iteri fun_num_stack_slots ~f:(fun stack_class num_slots ->
       let available_slots =
         Seq.ints 0 |> Seq.take num_slots |> Int.Set.of_seq
       in
@@ -211,16 +214,20 @@ let postcondition_layout : Cfg_with_layout.t -> unit =
         set |> Int.Set.elements |> List.map ~f:string_of_int
         |> String.concat ", "
       in
-      let invalid = Int.Set.diff used_stack_slots.(reg_class) available_slots in
+      let invalid =
+        Int.Set.diff used_stack_slots.(stack_class) available_slots
+      in
       if not (Int.Set.is_empty invalid)
       then
-        fatal "register class %d uses the following invalid slots: %s" reg_class
-          (string_of_set invalid);
-      let unused = Int.Set.diff available_slots used_stack_slots.(reg_class) in
+        fatal "stack slot class %d uses the following invalid slots: %s"
+          stack_class (string_of_set invalid);
+      let unused =
+        Int.Set.diff available_slots used_stack_slots.(stack_class)
+      in
       if not (Int.Set.is_empty unused)
       then
-        fatal "register class %d has the following unused slots: %s" reg_class
-          (string_of_set unused))
+        fatal "stack slot class %d has the following unused slots: %s"
+          stack_class (string_of_set unused))
 
 let postcondition_liveness : Cfg_with_infos.t -> unit =
  fun cfg_with_infos ->
diff --git a/backend/regalloc/regalloc_irc.ml b/backend/regalloc/regalloc_irc.ml
index fc925c9f653..84e5b70f095 100644
--- a/backend/regalloc/regalloc_irc.ml
+++ b/backend/regalloc/regalloc_irc.ml
@@ -89,8 +89,9 @@ let build : State.t -> Cfg_with_infos.t -> unit =
         let live = Cfg_dataflow.Instr.Tbl.find liveness first_id in
         Reg.Set.iter
           (fun reg1 ->
-            Array.iter (filter_fp Proc.destroyed_at_raise) ~f:(fun reg2 ->
-                State.add_edge state reg1 reg2))
+            Array.iter
+              (filter_fp (Proc.destroyed_at_raise ()))
+              ~f:(fun reg2 -> State.add_edge state reg1 reg2))
           (Reg.Set.remove Proc.loc_exn_bucket live.before))
 
 let make_work_list : State.t -> unit =
@@ -525,6 +526,6 @@ let run : Cfg_with_infos.t -> Cfg_with_infos.t =
     state
     ~f:(fun () ->
       update_register_locations ();
-      Array.iter all_precolored_regs ~f:(fun reg -> reg.Reg.degree <- 0))
+      Reg.Set.iter (fun reg -> reg.Reg.degree <- 0) (all_precolored_regs ()))
     cfg_with_infos;
   cfg_with_infos
diff --git a/backend/regalloc/regalloc_irc_state.ml b/backend/regalloc/regalloc_irc_state.ml
index d4f1ca80a40..abe423381aa 100644
--- a/backend/regalloc/regalloc_irc_state.ml
+++ b/backend/regalloc/regalloc_irc_state.ml
@@ -60,7 +60,8 @@ let[@inline] make ~initial ~stack_slots ~next_instruction_id () =
       reg.Reg.interf <- [];
       reg.Reg.degree <- 0);
   List.iter initial ~f:(fun reg -> reg.Reg.irc_work_list <- Initial);
-  Array.iter all_precolored_regs ~f:(fun reg ->
+  Reg.Set.iter
+    (fun reg ->
       reg.Reg.irc_work_list <- Reg.Precolored;
       reg.Reg.irc_color
         <- (match reg.Reg.loc with
@@ -70,7 +71,8 @@ let[@inline] make ~initial ~stack_slots ~next_instruction_id () =
                Printmach.reg reg);
       reg.Reg.irc_alias <- None;
       reg.Reg.interf <- [];
-      reg.Reg.degree <- Degree.infinite);
+      reg.Reg.degree <- Degree.infinite)
+    (all_precolored_regs ());
   let num_registers = List.length initial in
   let original_capacity = Int.min max_capacity num_registers in
   let simplify_work_list = RegWorkList.make ~original_capacity in
@@ -140,7 +142,8 @@ let[@inline] reset state ~new_temporaries =
       reg.Reg.irc_alias <- None;
       reg.Reg.interf <- [];
       reg.Reg.degree <- 0);
-  Array.iter all_precolored_regs ~f:(fun reg ->
+  Reg.Set.iter
+    (fun reg ->
       assert (reg.Reg.irc_work_list = Reg.Precolored);
       (match reg.Reg.loc, reg.Reg.irc_color with
       | Reg color, Some color' -> assert (color = color')
@@ -148,7 +151,8 @@ let[@inline] reset state ~new_temporaries =
       | (Unknown | Stack _), _ -> assert false);
       reg.Reg.irc_alias <- None;
       reg.Reg.interf <- [];
-      assert (reg.Reg.degree = Degree.infinite));
+      assert (reg.Reg.degree = Degree.infinite))
+    (all_precolored_regs ());
   state.initial <- Doubly_linked_list.of_list new_temporaries;
   Doubly_linked_list.transfer ~from:state.colored_nodes ~to_:state.initial ();
   RegWorkList.iter state.coalesced_nodes ~f:(fun reg ->
@@ -534,10 +538,10 @@ let[@inline] invariant state =
   then (
     (* interf (list) is morally a set *)
     List.iter (Reg.all_registers ()) ~f:check_inter_has_no_duplicates;
-    Array.iter all_precolored_regs ~f:check_inter_has_no_duplicates;
+    Reg.Set.iter check_inter_has_no_duplicates (all_precolored_regs ());
     (* register sets are disjoint *)
     check_disjoint ~is_disjoint:Reg.Set.disjoint
-      [ "precolored", Reg.set_of_array all_precolored_regs;
+      [ "precolored", all_precolored_regs ();
         "initial", reg_set_of_doubly_linked_list state.initial;
         "simplify_work_list", reg_set_of_reg_work_list state.simplify_work_list;
         "freeze_work_list", reg_set_of_reg_work_list state.freeze_work_list;
@@ -547,7 +551,7 @@ let[@inline] invariant state =
         "colored_nodes", reg_set_of_doubly_linked_list state.colored_nodes;
         "select_stack", Reg.Set.of_list state.select_stack ];
     List.iter ~f:check_set_and_field_consistency_reg
-      [ "precolored", Reg.set_of_array all_precolored_regs, Reg.Precolored;
+      [ "precolored", all_precolored_regs (), Reg.Precolored;
         "initial", reg_set_of_doubly_linked_list state.initial, Reg.Initial;
         ( "simplify_work_list",
           reg_set_of_reg_work_list state.simplify_work_list,
@@ -605,7 +609,7 @@ let[@inline] invariant state =
            (reg_set_of_reg_work_list state.spill_work_list))
     in
     let work_lists_or_precolored =
-      Reg.Set.union (Reg.set_of_array all_precolored_regs) work_lists
+      Reg.Set.union (all_precolored_regs ()) work_lists
     in
     Reg.Set.iter
       (fun u ->
diff --git a/backend/regalloc/regalloc_irc_utils.ml b/backend/regalloc/regalloc_irc_utils.ml
index 69ad7ab5fe4..886d7da68cc 100644
--- a/backend/regalloc/regalloc_irc_utils.ml
+++ b/backend/regalloc/regalloc_irc_utils.ml
@@ -111,6 +111,7 @@ let is_move_basic : Cfg.basic -> bool =
     | Const_int _ -> false
     | Const_float _ -> false
     | Const_symbol _ -> false
+    | Const_vec128 _ -> false
     | Stackoffset _ -> false
     | Load _ -> false
     | Store _ -> false
@@ -140,22 +141,9 @@ let is_move_basic : Cfg.basic -> bool =
 let is_move_instruction : Cfg.basic Cfg.instruction -> bool =
  fun instr -> is_move_basic instr.desc
 
-let all_precolored_regs : Reg.t array =
+let all_precolored_regs =
   Proc.init ();
-  let num_available_registers =
-    Array.fold_left Proc.num_available_registers ~f:( + ) ~init:0
-  in
-  let res = Array.make num_available_registers Reg.dummy in
-  let i = ref 0 in
-  for reg_class = 0 to pred Proc.num_register_classes do
-    let first_available_register = Proc.first_available_register.(reg_class) in
-    let num_available_registers = Proc.num_available_registers.(reg_class) in
-    for reg_idx = 0 to pred num_available_registers do
-      res.(!i) <- Proc.phys_reg (first_available_register + reg_idx);
-      incr i
-    done
-  done;
-  res
+  Proc.precolored_regs
 
 let k reg = Proc.num_available_registers.(Proc.register_class reg)
 
diff --git a/backend/regalloc/regalloc_irc_utils.mli b/backend/regalloc/regalloc_irc_utils.mli
index a2123413f85..5333ff04b89 100644
--- a/backend/regalloc/regalloc_irc_utils.mli
+++ b/backend/regalloc/regalloc_irc_utils.mli
@@ -64,7 +64,7 @@ end
 
 val is_move_instruction : Instruction.t -> bool
 
-val all_precolored_regs : Reg.t array
+val all_precolored_regs : unit -> Reg.Set.t
 
 val k : Reg.t -> int
 
diff --git a/backend/regalloc/regalloc_ls.ml b/backend/regalloc/regalloc_ls.ml
index 7c65e41bb88..7644bb4dce1 100644
--- a/backend/regalloc/regalloc_ls.ml
+++ b/backend/regalloc/regalloc_ls.ml
@@ -75,7 +75,7 @@ let build_intervals : State.t -> Cfg_with_infos.t -> unit =
     let off = on + 1 in
     if trap_handler
     then
-      Array.iter Proc.destroyed_at_raise ~f:(fun reg ->
+      Array.iter (Proc.destroyed_at_raise ()) ~f:(fun reg ->
           update_range reg ~begin_:on ~end_:on);
     instr.ls_order <- on;
     Array.iter instr.arg ~f:(fun reg -> update_range reg ~begin_:on ~end_:on);
diff --git a/backend/regalloc/regalloc_ls_state.ml b/backend/regalloc/regalloc_ls_state.ml
index 80fa3cbd3ad..f733614fde4 100644
--- a/backend/regalloc/regalloc_ls_state.ml
+++ b/backend/regalloc/regalloc_ls_state.ml
@@ -56,6 +56,8 @@ let[@inline] release_expired_intervals state ~pos =
 
 let[@inline] active state ~reg_class = state.active.(reg_class)
 
+let[@inline] active_classes state = state.active
+
 let[@inline] stack_slots state = state.stack_slots
 
 let[@inline] get_and_incr_instruction_id state =
diff --git a/backend/regalloc/regalloc_ls_state.mli b/backend/regalloc/regalloc_ls_state.mli
index 95e7646e175..da0b601817d 100644
--- a/backend/regalloc/regalloc_ls_state.mli
+++ b/backend/regalloc/regalloc_ls_state.mli
@@ -20,6 +20,8 @@ val release_expired_intervals : t -> pos:int -> unit
 
 val active : t -> reg_class:int -> ClassIntervals.t
 
+val active_classes : t -> ClassIntervals.t array
+
 val stack_slots : t -> Regalloc_stack_slots.t
 
 val get_and_incr_instruction_id : t -> Instruction.id
diff --git a/backend/regalloc/regalloc_rewrite.ml b/backend/regalloc/regalloc_rewrite.ml
index 88fb32e5c64..2b97e08f64d 100644
--- a/backend/regalloc/regalloc_rewrite.ml
+++ b/backend/regalloc/regalloc_rewrite.ml
@@ -261,8 +261,8 @@ let postlude :
   if Utils.debug
   then
     Array.iteri (Cfg_with_layout.cfg cfg_with_layout).fun_num_stack_slots
-      ~f:(fun reg_class num_stack_slots ->
-        Utils.log ~indent:1 "stack_slots[%d]=%d" reg_class num_stack_slots);
+      ~f:(fun stack_class num_stack_slots ->
+        Utils.log ~indent:1 "stack_slots[%d]=%d" stack_class num_stack_slots);
   remove_prologue_if_not_required cfg_with_layout;
   update_live_fields cfg_with_layout (Cfg_with_infos.liveness cfg_with_infos);
   f ();
diff --git a/backend/regalloc/regalloc_stack_slots.ml b/backend/regalloc/regalloc_stack_slots.ml
index e455f7e1964..13314fa702f 100644
--- a/backend/regalloc/regalloc_stack_slots.ml
+++ b/backend/regalloc/regalloc_stack_slots.ml
@@ -14,22 +14,22 @@ type t =
 
 let[@inline] make () =
   let stack_slots = Reg.Tbl.create 128 in
-  let num_stack_slots = Array.make Proc.num_register_classes 0 in
+  let num_stack_slots = Array.make Proc.num_stack_slot_classes 0 in
   { stack_slots; num_stack_slots }
 
-let[@inline] size_for_all_reg_classes t =
+let[@inline] size_for_all_stack_classes t =
   Array.fold_left t.num_stack_slots ~f:( + ) ~init:0
 
-let[@inline] get_and_incr t ~reg_class =
-  let res = t.num_stack_slots.(reg_class) in
-  t.num_stack_slots.(reg_class) <- succ res;
+let[@inline] get_and_incr t ~stack_class =
+  let res = t.num_stack_slots.(stack_class) in
+  t.num_stack_slots.(stack_class) <- succ res;
   res
 
 let[@inline] get_or_create t reg =
   match Reg.Tbl.find_opt t.stack_slots reg with
   | Some slot -> slot
   | None ->
-    let res = get_and_incr t ~reg_class:(Proc.register_class reg) in
+    let res = get_and_incr t ~stack_class:(Proc.stack_slot_class reg.Reg.typ) in
     Reg.Tbl.replace t.stack_slots reg res;
     res
 
@@ -47,12 +47,12 @@ let[@inline] update_cfg_with_layout t cfg_with_layout =
   let fun_num_stack_slots =
     (Cfg_with_layout.cfg cfg_with_layout).fun_num_stack_slots
   in
-  for reg_class = 0 to pred Proc.num_register_classes do
-    fun_num_stack_slots.(reg_class) <- t.num_stack_slots.(reg_class)
+  for stack_class = 0 to pred Proc.num_stack_slot_classes do
+    fun_num_stack_slots.(stack_class) <- t.num_stack_slots.(stack_class)
   done
 
 (** The optimization below is conceptually fairly close to what linscan does:
-   - for each register class / stack slot couple, we compute the interval of
+   - for each stack slot class / stack slot couple, we compute the interval of
      uses;
    - we re-assign slots by putting in the same "bucket" slots whose
      intervals do not overlap.
@@ -138,7 +138,7 @@ module Intervals : sig
   type slots = t
 
   type t = Interval.t array array
-  (* first index is register class, second index is slot index *)
+  (* first index is stack slot class, second index is slot index *)
 
   val build_from_cfg : slots -> Cfg_with_infos.t -> t
 
@@ -148,14 +148,14 @@ with type slots := t = struct
   type t = Interval.t array array
 
   let make slots =
-    Array.init Proc.num_register_classes ~f:(fun reg_class ->
-        Array.init slots.num_stack_slots.(reg_class) ~f:(fun _ ->
+    Array.init Proc.num_stack_slot_classes ~f:(fun stack_class ->
+        Array.init slots.num_stack_slots.(stack_class) ~f:(fun _ ->
             { Interval.start = Point.dummy; end_ = Point.dummy }))
 
   let visit_reg (t : t) (point : Point.t) (reg : Reg.t) : unit =
     apply_reg_stack_local reg ~f:(fun slot_index ->
-        let reg_class = Proc.register_class reg in
-        let interval = t.(reg_class).(slot_index) in
+        let stack_class = Proc.stack_slot_class reg.typ in
+        let interval = t.(stack_class).(slot_index) in
         if interval.start == Point.dummy then interval.start <- point;
         interval.end_ <- point)
 
@@ -194,10 +194,10 @@ with type slots := t = struct
     intervals
 
   let print ppf t =
-    Array.iteri t ~f:(fun reg_class intervals ->
+    Array.iteri t ~f:(fun stack_class intervals ->
         Array.iteri intervals ~f:(fun slot_index interval ->
-            Format.fprintf ppf "reg_class=%d slot_index=%d -> %a\n" reg_class
-              slot_index Interval.print interval))
+            Format.fprintf ppf "stack_class=%d slot_index=%d -> %a\n"
+              stack_class slot_index Interval.print interval))
 end
 
 module Int = Numbers.Int
@@ -211,13 +211,13 @@ module Buckets : sig
 
   val contains_empty : t -> bool
 
-  val find_bucket : t -> reg_class:int -> slot_index:slot -> int option
+  val find_bucket : t -> stack_class:int -> slot_index:slot -> int option
 
   val print : Format.formatter -> t -> unit
 end
 with type slots := t = struct
   type t = Interval.t Int.Tbl.t array array
-  (* first index is register class, second index is bucket index, table key is
+  (* first index is stack slot class, second index is bucket index, table key is
      slot index *)
 
   let does_not_fit (bucket : Interval.t Int.Tbl.t) (interval : Interval.t) :
@@ -232,13 +232,13 @@ with type slots := t = struct
 
   let build_from_intervals slots intervals =
     let buckets =
-      Array.init Proc.num_register_classes ~f:(fun reg_class ->
-          let num_slots = slots.num_stack_slots.(reg_class) in
+      Array.init Proc.num_stack_slot_classes ~f:(fun stack_class ->
+          let num_slots = slots.num_stack_slots.(stack_class) in
           Array.init num_slots ~f:(fun _ -> Int.Tbl.create num_slots))
     in
-    Array.iteri intervals ~f:(fun reg_class intervals ->
+    Array.iteri intervals ~f:(fun stack_class intervals ->
         Array.iteri intervals ~f:(fun slot_index interval ->
-            let buckets = buckets.(reg_class) in
+            let buckets = buckets.(stack_class) in
             let bucket_index = ref 0 in
             while
               !bucket_index < Array.length buckets
@@ -259,8 +259,8 @@ with type slots := t = struct
           let last_bucket = buckets.(len - 1) in
           Int.Tbl.length last_bucket = 0)
 
-  let find_bucket t ~reg_class ~slot_index =
-    let buckets = t.(reg_class) in
+  let find_bucket t ~stack_class ~slot_index =
+    let buckets = t.(stack_class) in
     let len = Array.length buckets in
     let bucket_index = ref 0 in
     while
@@ -272,9 +272,9 @@ with type slots := t = struct
     if !bucket_index < len then Some !bucket_index else None
 
   let print ppf t =
-    Array.iteri t ~f:(fun reg_class buckets ->
+    Array.iteri t ~f:(fun stack_class buckets ->
         Array.iteri buckets ~f:(fun bucket_index bucket ->
-            Format.fprintf ppf "reg_class=%d bucket_index=%d\n" reg_class
+            Format.fprintf ppf "stack_class=%d bucket_index=%d\n" stack_class
               bucket_index;
             Int.Tbl.iter
               (fun slot_index interval ->
@@ -284,7 +284,7 @@ with type slots := t = struct
 end
 
 let optimize (t : t) (cfg_with_infos : Cfg_with_infos.t) : unit =
-  if size_for_all_reg_classes t > 0
+  if size_for_all_stack_classes t > 0
   then (
     (* First, compute the intervals for all stack slots *)
     let intervals = Intervals.build_from_cfg t cfg_with_infos in
@@ -299,36 +299,38 @@ let optimize (t : t) (cfg_with_infos : Cfg_with_infos.t) : unit =
     (* Finally, if so, reassign the slot indices *)
     if optimized
     then (
-      let max_bucket_indices = Array.make Proc.num_register_classes (-1) in
+      let max_bucket_indices = Array.make Proc.num_stack_slot_classes (-1) in
       List.iter (Reg.all_registers ()) ~f:(fun (reg : Reg.t) ->
           apply_reg_stack_local reg ~f:(fun slot_index ->
-              let reg_class = Proc.register_class reg in
-              match Buckets.find_bucket buckets ~reg_class ~slot_index with
+              let stack_class = Proc.stack_slot_class reg.typ in
+              match Buckets.find_bucket buckets ~stack_class ~slot_index with
               | None ->
-                fatal "slot %d (reg_class=%d) is not in any of the buckets"
-                  slot_index reg_class
+                fatal "slot %d (stack_class=%d) is not in any of the buckets"
+                  slot_index stack_class
               | Some bucket_index ->
                 if debug
                 then
                   Format.eprintf
                     "changing the slot index of %a (class %d): %d ~> %d\n%!"
-                    Printmach.reg reg reg_class slot_index bucket_index;
+                    Printmach.reg reg stack_class slot_index bucket_index;
                 reg.loc <- Stack (Local bucket_index);
-                max_bucket_indices.(reg_class)
-                  <- Stdlib.Int.max max_bucket_indices.(reg_class) bucket_index;
+                max_bucket_indices.(stack_class)
+                  <- Stdlib.Int.max
+                       max_bucket_indices.(stack_class)
+                       bucket_index;
                 if Reg.Tbl.mem t.stack_slots reg
                 then Reg.Tbl.replace t.stack_slots reg bucket_index));
-      for reg_class = 0 to pred Proc.num_register_classes do
-        let old_value = t.num_stack_slots.(reg_class) in
-        let new_value = succ max_bucket_indices.(reg_class) in
+      for stack_class = 0 to pred Proc.num_stack_slot_classes do
+        let old_value = t.num_stack_slots.(stack_class) in
+        let new_value = succ max_bucket_indices.(stack_class) in
         if new_value > old_value
         then
           fatal "more slots are now used for class %d (before: %d, after: %d)"
-            reg_class old_value new_value;
+            stack_class old_value new_value;
         if debug
         then
-          Format.eprintf "reg_class %d has %d fewer slots (%d ~> %d)\n%!"
-            reg_class (old_value - new_value) old_value new_value;
-        t.num_stack_slots.(reg_class) <- new_value
+          Format.eprintf "stack_class %d has %d fewer slots (%d ~> %d)\n%!"
+            stack_class (old_value - new_value) old_value new_value;
+        t.num_stack_slots.(stack_class) <- new_value
       done;
       Cfg_with_infos.invalidate_liveness cfg_with_infos))
diff --git a/backend/regalloc/regalloc_stack_slots.mli b/backend/regalloc/regalloc_stack_slots.mli
index 77eaa4f1389..b24de06185a 100644
--- a/backend/regalloc/regalloc_stack_slots.mli
+++ b/backend/regalloc/regalloc_stack_slots.mli
@@ -6,9 +6,9 @@ type t
 
 val make : unit -> t
 
-val size_for_all_reg_classes : t -> int
+val size_for_all_stack_classes : t -> int
 
-val get_and_incr : t -> reg_class:int -> slot
+val get_and_incr : t -> stack_class:int -> slot
 
 val get_or_create : t -> Reg.t -> slot
 
diff --git a/backend/regalloc/regalloc_utils.ml b/backend/regalloc/regalloc_utils.ml
index 3c6a97219fe..3c62c74f380 100644
--- a/backend/regalloc/regalloc_utils.ml
+++ b/backend/regalloc/regalloc_utils.ml
@@ -250,6 +250,10 @@ let same_reg_class : Reg.t -> Reg.t -> bool =
  fun reg1 reg2 ->
   Int.equal (Proc.register_class reg1) (Proc.register_class reg2)
 
+let same_stack_class : Reg.t -> Reg.t -> bool =
+ fun reg1 reg2 ->
+  Int.equal (Proc.stack_slot_class reg1.typ) (Proc.stack_slot_class reg2.typ)
+
 let make_temporary :
     same_class_and_base_name_as:Reg.t -> name_prefix:string -> Reg.t =
  fun ~same_class_and_base_name_as:reg ~name_prefix ->
diff --git a/backend/regalloc/regalloc_utils.mli b/backend/regalloc/regalloc_utils.mli
index a1b48ca8560..2a114a61f2d 100644
--- a/backend/regalloc/regalloc_utils.mli
+++ b/backend/regalloc/regalloc_utils.mli
@@ -92,6 +92,8 @@ end
 
 val same_reg_class : Reg.t -> Reg.t -> bool
 
+val same_stack_class : Reg.t -> Reg.t -> bool
+
 val make_temporary :
   same_class_and_base_name_as:Reg.t -> name_prefix:string -> Reg.t
 
diff --git a/backend/regalloc/regalloc_validate.ml b/backend/regalloc/regalloc_validate.ml
index 42c5bc7e6ca..53d83347919 100644
--- a/backend/regalloc/regalloc_validate.ml
+++ b/backend/regalloc/regalloc_validate.ml
@@ -25,7 +25,7 @@ module Location : sig
 
   val to_loc_lossy : t -> Reg.location
 
-  val print : Format.formatter -> t -> unit
+  val print : Cmm.machtype_component -> Format.formatter -> t -> unit
 
   val equal : t -> t -> bool
 
@@ -35,15 +35,15 @@ module Location : sig
 end = struct
   module Stack = struct
     (** This type is based on [Reg.stack_location]. The first difference is that
-        for [Stack (Local index)] this types additionally stores [reg_class]
-        because local stacks are separate for different register classes.
+        for [Stack (Local index)] this types additionally stores [stack_class]
+        because local stacks are separate for different stack slot classes.
         Secondly for all stacks it stores index in words and not byte offset.
         That gives the guarantee that if indices are different then the
         locations do not overlap. *)
     type t =
       | Local of
           { index : int;
-            reg_class : int
+            stack_class : int
           }
       | Incoming of { index : int }
       | Outgoing of { index : int }
@@ -84,9 +84,9 @@ end = struct
 
     let word_index_to_byte_offset index = index * word_size
 
-    let of_stack_loc ~reg_class loc =
+    let of_stack_loc ~stack_class loc =
       match loc with
-      | Reg.Local index -> Local { index; reg_class }
+      | Reg.Local index -> Local { index; stack_class }
       | Reg.Incoming offset ->
         Incoming { index = byte_offset_to_word_index offset }
       | Reg.Outgoing offset ->
@@ -101,13 +101,6 @@ end = struct
       | Outgoing { index } -> Reg.Outgoing (word_index_to_byte_offset index)
       | Domainstate { index } ->
         Reg.Domainstate (word_index_to_byte_offset index)
-
-    let unknown_reg_class = -1
-
-    let reg_class_lossy t =
-      match t with
-      | Local { reg_class; _ } -> reg_class
-      | Incoming _ | Outgoing _ | Domainstate _ -> unknown_reg_class
   end
 
   type t =
@@ -120,7 +113,10 @@ end = struct
     | Reg.Reg idx -> Some (Reg idx)
     | Reg.Stack stack ->
       Some
-        (Stack (Stack.of_stack_loc ~reg_class:(Proc.register_class reg) stack))
+        (Stack
+           (Stack.of_stack_loc
+              ~stack_class:(Proc.stack_slot_class reg.Reg.typ)
+              stack))
 
   let of_reg_exn reg = of_reg reg |> Option.get
 
@@ -131,13 +127,8 @@ end = struct
     | Reg idx -> Reg.Reg idx
     | Stack stack -> Reg.Stack (Stack.to_stack_loc_lossy stack)
 
-  let reg_class_lossy t =
-    match t with Reg _ -> -1 | Stack stack -> Stack.reg_class_lossy stack
-
-  let print ppf t =
-    Printmach.loc ~reg_class:(reg_class_lossy t)
-      ~unknown:(fun _ -> assert false)
-      ppf (to_loc_lossy t)
+  let print typ ppf t =
+    Printmach.loc ~unknown:(fun _ -> assert false) ppf (to_loc_lossy t) typ
 
   let compare (t1 : t) (t2 : t) : int =
     (* CR-someday azewierzejew: Implement proper comparison. *)
@@ -210,6 +201,8 @@ module Register : sig
 
   val print : Format.formatter -> t -> unit
 
+  val typ : t -> Cmm.machtype_component
+
   module Set : Set.S with type elt = t
 
   module Map : Map.S with type key = t
@@ -240,10 +233,12 @@ end = struct
       loc = Reg_id.to_loc_lossy t.reg_id
     }
 
+  let typ (t : t) = t.for_print.typ
+
   let print (ppf : Format.formatter) (t : t) : unit =
     match t.reg_id with
     | Preassigned { location } ->
-      Format.fprintf ppf "R[%a]" Location.print location
+      Format.fprintf ppf "R[%a]" (Location.print t.for_print.typ) location
     | Named _ -> Printmach.reg ppf (to_dummy_reg t)
 
   let compare (t1 : t) (t2 : t) : int = Reg_id.compare t1.reg_id t2.reg_id
@@ -454,7 +449,10 @@ end = struct
         | Preassigned { location = prev_loc }, Some new_loc ->
           Regalloc_utils.fatal
             "%s: changed preassigned register's location from %a to %a" context
-            Location.print prev_loc Location.print new_loc)
+            (Location.print (Register.typ reg_desc))
+            prev_loc
+            (Location.print loc_reg.Reg.typ)
+            new_loc)
       reg_arr loc_arr;
     ()
 
@@ -773,7 +771,9 @@ end = struct
     type t = Register.t * Location.t
 
     let print ppf (r, l) =
-      Format.fprintf ppf "%a=%a" Register.print r Location.print l
+      Format.fprintf ppf "%a=%a" Register.print r
+        (Location.print (Register.typ r))
+        l
   end
 
   exception Verification_failed of string
@@ -872,7 +872,7 @@ end = struct
       then (
         Format.fprintf Format.str_formatter
           "Unsatisfiable equations when removing result equations.\n\
-           Existing equation has to agree one 0 or 2 sides (cannot on exactly \
+           Existing equation has to agree on 0 or 2 sides (cannot be exactly \
            1) with the removed equation.\n\
            Existing equation %a.\n\
            Removed equation: %a." Equation.print (eq_reg, eq_loc) Equation.print
@@ -913,9 +913,10 @@ end = struct
           | None -> ()
           | Some regs ->
             assert (not (Register.Set.is_empty regs));
+            let typ = Register.Set.choose regs |> Register.typ in
             Format.fprintf Format.str_formatter
-              "Destroying a location %a in which a live registers %a is stored"
-              Location.print destroyed_loc
+              "Destroying a location %a in which live registers %a are stored"
+              (Location.print typ) destroyed_loc
               (Format.pp_print_seq
                  ~pp_sep:(fun ppf () -> Format.fprintf ppf ", ")
                  Register.print)
@@ -964,9 +965,9 @@ module type Description_value = sig
 end
 
 let print_reg_as_loc ppf reg =
-  Printmach.loc ~reg_class:(Proc.register_class reg)
+  Printmach.loc
     ~unknown:(fun ppf -> Format.fprintf ppf "<Unknown>")
-    ppf reg.Reg.loc
+    ppf reg.Reg.loc reg.Reg.typ
 
 module Domain : Cfg_dataflow.Domain_S with type t = Equation_set.t = struct
   (** This type corresponds to the set of equations in the dataflow from the
@@ -1125,7 +1126,7 @@ module Transfer (Desc_val : Description_value) :
                     equations
                     |> Equation_set.verify_destroyed_locations
                          ~destroyed:
-                           (Location.of_regs_exn Proc.destroyed_at_raise)
+                           (Location.of_regs_exn (Proc.destroyed_at_raise ()))
                     |> Result.map_error (fun message ->
                            Printf.sprintf
                              "While verifying locations destroyed at raise: %s"
@@ -1306,8 +1307,8 @@ end = struct
         Format.fprintf ppf "Function argument locations: %a\n"
           (Format.pp_print_seq
              ~pp_sep:(fun ppf () -> Format.pp_print_string ppf ", ")
-             Location.print)
-          (Array.to_seq loc_fun_args);
+             (fun ppf (reg, loc) -> Location.print (Register.typ reg) ppf loc))
+          (Array.to_seq (Array.combine reg_fun_args loc_fun_args));
         ()
   end
 
diff --git a/backend/reloadgen.ml b/backend/reloadgen.ml
index d5ca5521b3d..53cb179c8f1 100644
--- a/backend/reloadgen.ml
+++ b/backend/reloadgen.ml
@@ -67,7 +67,7 @@ method reload_operation op arg res =
       begin match arg.(0), res.(0) with
         {loc = Stack s1}, {loc = Stack s2} ->
           if s1 = s2
-          && Proc.register_class arg.(0) = Proc.register_class res.(0) then
+          && Proc.stack_slot_class arg.(0).typ = Proc.stack_slot_class res.(0).typ then
             (* nothing will be emitted later,
                not necessary to apply constraints *)
             (arg, res)
diff --git a/backend/selectgen.ml b/backend/selectgen.ml
index cc33b0ed374..7e513560f36 100644
--- a/backend/selectgen.ml
+++ b/backend/selectgen.ml
@@ -199,6 +199,7 @@ let oper_result_type = function
       begin match c with
       | Word_val -> typ_val
       | Single | Double -> typ_float
+      | Onetwentyeight -> typ_vec128
       | _ -> typ_int
       end
   | Calloc _ -> typ_val
@@ -236,6 +237,7 @@ let size_component = function
   | Val | Addr -> Arch.size_addr
   | Int -> Arch.size_int
   | Float -> Arch.size_float
+  | Vec128 -> Arch.size_vec128
 
 let size_machtype mty =
   let size = ref 0 in
@@ -250,6 +252,7 @@ let size_expr (env:environment) exp =
     | Cconst_symbol _ ->
         Arch.size_addr
     | Cconst_float _ -> Arch.size_float
+    | Cconst_vec128 _ -> Arch.size_vec128
     | Cvar id ->
         begin try
           V.Map.find id localenv
@@ -474,6 +477,7 @@ method is_simple_expr = function
   | Cconst_natint _ -> true
   | Cconst_float _ -> true
   | Cconst_symbol _ -> true
+  | Cconst_vec128 _ -> true
   | Cvar _ -> true
   | Ctuple el -> List.for_all self#is_simple_expr el
   | Clet(_id, arg, body) | Clet_mut(_id, _, arg, body) ->
@@ -519,7 +523,7 @@ method is_simple_expr = function
 method effects_of exp =
   let module EC = Effect_and_coeffect in
   match exp with
-  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _
+  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _ | Cconst_vec128 _
   | Cvar _ -> EC.none
   | Ctuple el -> EC.join_list_map el self#effects_of
   | Clet (_id, arg, body) | Clet_mut (_id, _, arg, body) ->
@@ -853,6 +857,9 @@ method emit_expr_aux (env:environment) exp :
   | Cconst_float (n, _dbg) ->
       let r = self#regs_for typ_float in
       ret (self#insert_op env (Iconst_float (Int64.bits_of_float n)) [||] r)
+  | Cconst_vec128 (bits, _dbg) ->
+    let r = self#regs_for typ_vec128 in
+    ret (self#insert_op env (Iconst_vec128 bits) [||] r)
   | Cconst_symbol (n, _dbg) ->
       (* Cconst_symbol _ evaluates to a statically-allocated address, so its
          value fits in a typ_int register and is never changed by the GC.
@@ -1372,7 +1379,11 @@ method emit_stores env data regs_addr =
             Istore(_, _, _) ->
               for i = 0 to Array.length regs - 1 do
                 let r = regs.(i) in
-                let kind = if r.typ = Float then Double else Word_val in
+                let kind = match r.typ with
+                  | Float -> Double
+                  | Vec128 -> Onetwentyeight
+                  | Val | Addr | Int ->  Word_val
+                in
                 self#insert env
                             (Iop(Istore(kind, !a, false)))
                             (Array.append [|r|] regs_addr) [||];
@@ -1613,7 +1624,7 @@ method emit_tail (env:environment) exp =
          self#emit_tail { env with regions } e
       end
   | Cop _
-  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _
+  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _ | Cconst_vec128 _
   | Cvar _
   | Cassign _
   | Ctuple _
@@ -1660,7 +1671,7 @@ method emit_fundecl ~future_funcnames f =
     fun_codegen_options = f.Cmm.fun_codegen_options;
     fun_dbg  = f.Cmm.fun_dbg;
     fun_poll = f.Cmm.fun_poll;
-    fun_num_stack_slots = Array.make Proc.num_register_classes 0;
+    fun_num_stack_slots = Array.make Proc.num_stack_slot_classes 0;
     fun_contains_calls = !contains_calls;
   }
 
diff --git a/backend/x86_ast.mli b/backend/x86_ast.mli
index 52b1c3b37cb..08b91732545 100644
--- a/backend/x86_ast.mli
+++ b/backend/x86_ast.mli
@@ -57,7 +57,8 @@ type constant =
 type data_type =
   | NONE
   | REAL4 | REAL8 (* floating point values *)
-  | BYTE | WORD | DWORD | QWORD | OWORD (* integer values *)
+  | BYTE | WORD | DWORD | QWORD (* integer values *)
+  | VEC128 (* vector values (float & integer) *)
   | NEAR | PROC
 
 type reg64 =
@@ -67,7 +68,6 @@ type reg64 =
 type reg8h =
   | AH | BH | CH | DH
 
-
 type regf = XMM of int
 
 type arch = X64 | X86
@@ -145,6 +145,7 @@ type instruction =
   | MINSD of arg * arg
   | MOV of arg * arg
   | MOVAPD of arg * arg
+  | MOVUPD of arg * arg
   | MOVD of arg * arg
   | MOVQ of arg * arg
   | MOVLPD of arg * arg
diff --git a/backend/x86_binary_emitter.ml b/backend/x86_binary_emitter.ml
index 15b11cf3adc..09ab0ed0d22 100644
--- a/backend/x86_binary_emitter.ml
+++ b/backend/x86_binary_emitter.ml
@@ -434,8 +434,8 @@ let emit_mod_rm_reg b rex opcodes rm reg =
       emit_rex b (rex lor rex_of_reg16 reg16 lor rexr_reg reg lor rexb_rm rm);
       buf_opcodes b opcodes;
       buf_int8 b (mod_rm_reg 0b11 rm reg)
-  | Regf rm ->
-      let rm = rd_of_regf rm in
+  | Regf regf ->
+      let rm = rd_of_regf regf in
       emit_rex b (rex lor rexr_reg reg lor rexb_rm rm);
       buf_opcodes b opcodes;
       buf_int8 b (mod_rm_reg 0b11 rm reg)
@@ -569,6 +569,16 @@ let emit_movapd b dst src =
       emit_mod_rm_reg b 0 [ 0x0f; 0x29 ] rm (rd_of_regf reg)
   | _ -> assert false
 
+let emit_movupd b dst src =
+  match (dst, src) with
+  | Regf reg, ((Regf _ | Mem _ | Mem64_RIP _) as rm) ->
+      buf_int8 b 0x66;
+      emit_mod_rm_reg b 0 [ 0x0f; 0x10 ] rm (rd_of_regf reg)
+  | ((Mem _ | Mem64_RIP _) as rm), Regf reg ->
+      buf_int8 b 0x66;
+      emit_mod_rm_reg b 0 [ 0x0f; 0x11 ] rm (rd_of_regf reg)
+  | _ -> assert false
+
 let emit_movd b ~dst ~src =
   match (dst, src) with
   | Regf reg, ((Reg32 _ | Mem _ | Mem64_RIP _) as rm) ->
@@ -1441,6 +1451,7 @@ let assemble_instr b loc = function
   | MINSD (src, dst) -> emit_minsd b ~dst ~src
   | MOV (src, dst) -> emit_MOV b dst src
   | MOVAPD (src, dst) -> emit_movapd b dst src
+  | MOVUPD (src, dst) -> emit_movupd b dst src
   | MOVD (src, dst) -> emit_movd b ~dst ~src
   | MOVQ (src, dst) -> emit_movq b ~dst ~src
   | MOVLPD (src, dst) -> emit_movlpd b dst src
diff --git a/backend/x86_dsl.ml b/backend/x86_dsl.ml
index 0e39a6d1eb7..f0b39857c1d 100644
--- a/backend/x86_dsl.ml
+++ b/backend/x86_dsl.ml
@@ -156,6 +156,7 @@ module I = struct
   let minsd x y = emit (MINSD (x,y))
   let mov x y = emit (MOV (x, y))
   let movapd x y = emit (MOVAPD (x, y))
+  let movupd x y = emit (MOVUPD (x, y))
   let movd x y = emit (MOVD (x, y))
   let movq x y = emit (MOVQ (x, y))
   let movsd x y = emit (MOVSD (x, y))
diff --git a/backend/x86_dsl.mli b/backend/x86_dsl.mli
index bfc9e4b1b8d..6dd82de113a 100644
--- a/backend/x86_dsl.mli
+++ b/backend/x86_dsl.mli
@@ -151,6 +151,7 @@ module I : sig
   val minsd: arg -> arg -> unit
   val mov: arg -> arg -> unit
   val movapd: arg -> arg -> unit
+  val movupd: arg -> arg -> unit
   val movd: arg -> arg -> unit
   val movq: arg -> arg -> unit
   val movsd: arg -> arg -> unit
diff --git a/backend/x86_gas.ml b/backend/x86_gas.ml
index e88e5efd708..f4c44076ecd 100644
--- a/backend/x86_gas.ml
+++ b/backend/x86_gas.ml
@@ -98,8 +98,8 @@ let suf arg =
   | DWORD | REAL8 -> "l"
   | QWORD -> "q"
   | REAL4 -> "s"
-  | NONE -> ""
-  | OWORD | NEAR | PROC -> assert false
+  | VEC128 | NONE -> ""
+  | NEAR | PROC -> assert false
 
 let i0 b s = bprintf b "\t%s" s
 let i1 b s x = bprintf b "\t%s\t%a" s arg x
@@ -164,6 +164,7 @@ let print_instr b = function
       i2 b "movabsq" arg1 arg2
   | MOV (arg1, arg2) -> i2_s b "mov" arg1 arg2
   | MOVAPD (arg1, arg2) -> i2 b "movapd" arg1 arg2
+  | MOVUPD (arg1, arg2) -> i2 b "movupd" arg1 arg2
   | MOVD (arg1, arg2) -> i2 b "movd" arg1 arg2
   | MOVQ (arg1, arg2) -> i2 b "movq" arg1 arg2
   | MOVLPD (arg1, arg2) -> i2 b "movlpd" arg1 arg2
diff --git a/backend/x86_masm.ml b/backend/x86_masm.ml
index ae78851ec58..5c681950aa2 100644
--- a/backend/x86_masm.ml
+++ b/backend/x86_masm.ml
@@ -19,8 +19,8 @@ open X86_proc
 let bprintf = Printf.bprintf
 
 let string_of_datatype = function
+  | VEC128 -> "XMMWORD"
   | QWORD -> "QWORD"
-  | OWORD -> "OWORD"
   | NONE -> assert false
   | REAL4 -> "REAL4"
   | REAL8 -> "REAL8"
@@ -32,8 +32,8 @@ let string_of_datatype = function
 
 
 let string_of_datatype_ptr = function
+  | VEC128 -> "XMMWORD PTR "
   | QWORD -> "QWORD PTR "
-  | OWORD -> "OWORD PTR "
   | NONE -> ""
   | REAL4 -> "REAL4 PTR "
   | REAL8 -> "REAL8 PTR "
@@ -161,6 +161,7 @@ let print_instr b = function
       i2 b "mov" arg1 (Reg32 r)
   | MOV (arg1, arg2) -> i2 b "mov" arg1 arg2
   | MOVAPD (arg1, arg2) -> i2 b "movapd" arg1 arg2
+  | MOVUPD (arg1, arg2) -> i2 b "movupd" arg1 arg2
   | MOVD (arg1, arg2) -> i2 b "movd" arg1 arg2
   | MOVQ (arg1, arg2) -> i2 b "movq" arg1 arg2
   | MOVLPD (arg1, arg2) -> i2 b "movlpd" arg1 arg2
diff --git a/file_formats/cmx_format.mli b/file_formats/cmx_format.mli
index 5d7fabc45fd..ef686fab74c 100644
--- a/file_formats/cmx_format.mli
+++ b/file_formats/cmx_format.mli
@@ -42,7 +42,7 @@ type export_info_raw =
   | Flambda2_raw of Flambda2_cmx.Flambda_cmx_format.raw option
 
 (* Declare machtype here to avoid depending on [Cmm]. *)
-type machtype_component = Val | Addr | Int | Float
+type machtype_component = Val | Addr | Int | Float | Vec128
 type machtype = machtype_component array
 
 type apply_fn := machtype list * machtype * Lambda.alloc_mode
diff --git a/middle_end/clambda.ml b/middle_end/clambda.ml
index c0c09a5c410..4bd7fd68e57 100644
--- a/middle_end/clambda.ml
+++ b/middle_end/clambda.ml
@@ -32,6 +32,7 @@ type ustructured_constant =
   | Uconst_int32 of int32
   | Uconst_int64 of int64
   | Uconst_nativeint of nativeint
+  | Uconst_vec128 of { high : int64; low : int64 }
   | Uconst_block of int * uconstant list
   | Uconst_float_array of float list
   | Uconst_string of string
@@ -212,6 +213,7 @@ let rank_structured_constant = function
   | Uconst_float_array _ -> 5
   | Uconst_string _ -> 6
   | Uconst_closure _ -> 7
+  | Uconst_vec128 _ -> 8
 
 let compare_structured_constants c1 c2 =
   match c1, c2 with
@@ -227,6 +229,9 @@ let compare_structured_constants c1 c2 =
   | Uconst_string s1, Uconst_string s2 -> String.compare s1 s2
   | Uconst_closure (_,lbl1,_), Uconst_closure (_,lbl2,_) ->
       String.compare lbl1 lbl2
+  | Uconst_vec128 {high = l0; low = l1}, Uconst_vec128 {high = r0; low = r1} ->
+    let cmp = Int64.compare l0 r0 in
+    if cmp = 0 then Int64.compare l1 r1 else cmp
   | _, _ ->
     (* no overflow possible here *)
     rank_structured_constant c1 - rank_structured_constant c2
diff --git a/middle_end/clambda.mli b/middle_end/clambda.mli
index 9e1409e5721..42250fb7010 100644
--- a/middle_end/clambda.mli
+++ b/middle_end/clambda.mli
@@ -32,6 +32,7 @@ type ustructured_constant =
   | Uconst_int32 of int32
   | Uconst_int64 of int64
   | Uconst_nativeint of nativeint
+  | Uconst_vec128 of { high : int64; low : int64 }
   | Uconst_block of int * uconstant list
   | Uconst_float_array of float list
   | Uconst_string of string
diff --git a/middle_end/clambda_primitives.ml b/middle_end/clambda_primitives.ml
index 92725a7a771..a550d96726e 100644
--- a/middle_end/clambda_primitives.ml
+++ b/middle_end/clambda_primitives.ml
@@ -158,18 +158,23 @@ and value_kind = Lambda.value_kind =
       non_consts : (int * value_kind list) list;
     }
   | Parrayval of array_kind
+  | Pboxedvectorval of boxed_vector
 
 and layout = Lambda.layout =
   | Ptop
   | Pvalue of value_kind
   | Punboxed_float
   | Punboxed_int of boxed_integer
+  | Punboxed_vector of boxed_vector
   | Pbottom
 
 and block_shape = Lambda.block_shape
 and boxed_integer = Primitive.boxed_integer =
     Pnativeint | Pint32 | Pint64
 
+and boxed_vector = Primitive.boxed_vector =
+  | Pvec128
+
 and bigarray_kind = Lambda.bigarray_kind =
     Pbigarray_unknown
   | Pbigarray_float32 | Pbigarray_float64
diff --git a/middle_end/clambda_primitives.mli b/middle_end/clambda_primitives.mli
index b65e674ee76..11ecd5bd378 100644
--- a/middle_end/clambda_primitives.mli
+++ b/middle_end/clambda_primitives.mli
@@ -161,18 +161,24 @@ and value_kind = Lambda.value_kind =
       non_consts : (int * value_kind list) list;
     }
   | Parrayval of array_kind
+  | Pboxedvectorval of boxed_vector
 
 and layout = Lambda.layout =
   | Ptop
   | Pvalue of value_kind
   | Punboxed_float
   | Punboxed_int of boxed_integer
+  | Punboxed_vector of boxed_vector
   | Pbottom
 
 and block_shape = Lambda.block_shape
+
 and boxed_integer = Primitive.boxed_integer =
     Pnativeint | Pint32 | Pint64
 
+and boxed_vector = Primitive.boxed_vector =
+  | Pvec128
+
 and bigarray_kind = Lambda.bigarray_kind =
     Pbigarray_unknown
   | Pbigarray_float32 | Pbigarray_float64
diff --git a/middle_end/closure/closure.ml b/middle_end/closure/closure.ml
index 95f7588333d..df139603f61 100644
--- a/middle_end/closure/closure.ml
+++ b/middle_end/closure/closure.ml
@@ -64,7 +64,9 @@ let is_gc_ignorable kind =
   | Punboxed_float -> true
   | Punboxed_int _ -> true
   | Pvalue Pintval -> true
-  | Pvalue (Pgenval | Pfloatval | Pboxedintval _ | Pvariant _ | Parrayval _) -> false
+  | Punboxed_vector _ -> true
+  | Pvalue (Pgenval | Pfloatval | Pboxedintval _ | Pboxedvectorval _
+      | Pvariant _ | Parrayval _) -> false
 
 let split_closure_fv kinds fv =
   List.fold_right (fun id (not_scanned, scanned) ->
@@ -1757,7 +1759,7 @@ let collect_exported_structured_constants a =
   and structured_constant = function
     | Uconst_block (_, ul) -> List.iter const ul
     | Uconst_float _ | Uconst_int32 _
-    | Uconst_int64 _ | Uconst_nativeint _
+    | Uconst_int64 _ | Uconst_nativeint _ | Uconst_vec128 _
     | Uconst_float_array _ | Uconst_string _ -> ()
     | Uconst_closure _ -> assert false (* Cannot be generated *)
   and ulam = function
diff --git a/middle_end/flambda/closure_offsets.ml b/middle_end/flambda/closure_offsets.ml
index cfb1791786d..d771cdd3e54 100644
--- a/middle_end/flambda/closure_offsets.ml
+++ b/middle_end/flambda/closure_offsets.ml
@@ -79,6 +79,7 @@ let add_closure_offsets
              and not stored in a closure."
         | Punboxed_float -> true
         | Punboxed_int _ -> true
+        | Punboxed_vector _ -> true
         | Pvalue Pintval -> true
         | Pvalue _ -> false)
       free_vars
diff --git a/middle_end/flambda/flambda_to_clambda.ml b/middle_end/flambda/flambda_to_clambda.ml
index a2d6c2a23f2..b2613464916 100644
--- a/middle_end/flambda/flambda_to_clambda.ml
+++ b/middle_end/flambda/flambda_to_clambda.ml
@@ -712,6 +712,7 @@ and to_clambda_set_of_closures t env
              and not stored in a closure."
         | Punboxed_float -> true
         | Punboxed_int _ -> true
+        | Punboxed_vector _ -> true
         | Pvalue Pintval -> true
         | Pvalue _ -> false)
       free_vars
diff --git a/middle_end/flambda2/from_lambda/closure_conversion.ml b/middle_end/flambda2/from_lambda/closure_conversion.ml
index 728991b2ca3..b835937e8c1 100644
--- a/middle_end/flambda2/from_lambda/closure_conversion.ml
+++ b/middle_end/flambda2/from_lambda/closure_conversion.ml
@@ -206,7 +206,7 @@ let find_value_approximation env simple =
       match Reg_width_const.descr const with
       | Tagged_immediate i -> Value_approximation.Value_int i
       | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-      | Naked_nativeint _ ->
+      | Naked_vec128 _ | Naked_nativeint _ ->
         Value_approximation.Value_unknown)
 
 let find_value_approximation_through_symbol acc env simple =
@@ -448,6 +448,8 @@ let close_c_call acc env ~loc ~let_bound_ids_with_kinds
       Some (P.Box_number (Naked_int32, Alloc_mode.For_allocations.heap))
     | _, Unboxed_integer Pint64 ->
       Some (P.Box_number (Naked_int64, Alloc_mode.For_allocations.heap))
+    | _, Unboxed_vector Pvec128 ->
+      Some (P.Box_number (Naked_vec128, Alloc_mode.For_allocations.heap))
     | _, Untagged_int -> Some P.Tag_immediate
   in
   let return_continuation, needs_wrapper =
@@ -473,6 +475,7 @@ let close_c_call acc env ~loc ~let_bound_ids_with_kinds
     | Unboxed_integer Pint32 -> K.naked_int32
     | Unboxed_integer Pint64 -> K.naked_int64
     | Untagged_int -> K.naked_immediate
+    | Unboxed_vector Pvec128 -> K.naked_vec128
   in
   let param_arity =
     List.map kind_of_primitive_native_repr prim_native_repr_args
@@ -559,6 +562,7 @@ let close_c_call acc env ~loc ~let_bound_ids_with_kinds
           | _, Unboxed_integer Pint32 -> Some (P.Unbox_number Naked_int32)
           | _, Unboxed_integer Pint64 -> Some (P.Unbox_number Naked_int64)
           | _, Untagged_int -> Some P.Untag_immediate
+          | _, Unboxed_vector Pvec128 -> Some (P.Unbox_number Naked_vec128)
         in
         match unbox_arg with
         | None -> fun args acc -> call (arg :: args) acc
diff --git a/middle_end/flambda2/from_lambda/closure_conversion_aux.ml b/middle_end/flambda2/from_lambda/closure_conversion_aux.ml
index 32008def512..11821118b77 100644
--- a/middle_end/flambda2/from_lambda/closure_conversion_aux.ml
+++ b/middle_end/flambda2/from_lambda/closure_conversion_aux.ml
@@ -468,9 +468,9 @@ module Acc = struct
           Block_approximation (fields, Alloc_mode.For_types.unknown ())
         else Value_unknown
       | Set_of_closures _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-      | Boxed_nativeint _ | Immutable_float_block _ | Immutable_float_array _
-      | Immutable_value_array _ | Empty_array | Mutable_string _
-      | Immutable_string _ ->
+      | Boxed_vec128 _ | Boxed_nativeint _ | Immutable_float_block _
+      | Immutable_float_array _ | Immutable_value_array _ | Empty_array
+      | Mutable_string _ | Immutable_string _ ->
         Value_unknown
     in
     let symbol_approximations =
diff --git a/middle_end/flambda2/from_lambda/lambda_to_flambda.ml b/middle_end/flambda2/from_lambda/lambda_to_flambda.ml
index b495613bb54..004cb76f423 100644
--- a/middle_end/flambda2/from_lambda/lambda_to_flambda.ml
+++ b/middle_end/flambda2/from_lambda/lambda_to_flambda.ml
@@ -1197,7 +1197,7 @@ let rec cps acc env ccenv (lam : L.lambda) (k : cps_continuation)
       let id = Ident.create_local name in
       let result_layout = L.primitive_result_layout prim in
       (match result_layout with
-      | Pvalue _ | Punboxed_float | Punboxed_int _ -> ()
+      | Pvalue _ | Punboxed_float | Punboxed_int _ | Punboxed_vector _ -> ()
       | Ptop | Pbottom ->
         Misc.fatal_errorf "Invalid result layout %a for primitive %a"
           Printlambda.layout result_layout Printlambda.primitive prim);
diff --git a/middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml b/middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml
index 00fc7108122..cd2c1c99c4a 100644
--- a/middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml
+++ b/middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml
@@ -429,6 +429,8 @@ let bigarray_box_or_tag_raw_value_to_read kind alloc_mode =
     fun arg -> H.Unary (Box_number (Naked_int64, alloc_mode), Prim arg)
   | Naked_number Naked_nativeint ->
     fun arg -> H.Unary (Box_number (Naked_nativeint, alloc_mode), Prim arg)
+  | Naked_number Naked_vec128 ->
+    fun arg -> H.Unary (Box_number (Naked_vec128, alloc_mode), Prim arg)
   | Region -> error "a region expression"
   | Rec_info -> error "recursion info"
 
@@ -449,6 +451,8 @@ let bigarray_unbox_or_untag_value_to_store kind =
     fun arg -> H.Prim (Unary (Unbox_number Naked_int64, arg))
   | Naked_number Naked_nativeint ->
     fun arg -> H.Prim (Unary (Unbox_number Naked_nativeint, arg))
+  | Naked_number Naked_vec128 ->
+    fun arg -> H.Prim (Unary (Unbox_number Naked_vec128, arg))
   | Region -> error "a region expression"
   | Rec_info -> error "recursion info"
 
diff --git a/middle_end/flambda2/identifiers/int_ids.ml b/middle_end/flambda2/identifiers/int_ids.ml
index 7b708fa4684..f1c6d4f6fe3 100644
--- a/middle_end/flambda2/identifiers/int_ids.ml
+++ b/middle_end/flambda2/identifiers/int_ids.ml
@@ -48,6 +48,7 @@ module Const_data = struct
     | Naked_int32 of Int32.t
     | Naked_int64 of Int64.t
     | Naked_nativeint of Targetint_32_64.t
+    | Naked_vec128 of Numeric_types.Vec128_by_bit_pattern.t
 
   let flags = const_flags
 
@@ -86,6 +87,11 @@ module Const_data = struct
           Flambda_colours.naked_number
           Targetint_32_64.print n
           Flambda_colours.pop
+      | Naked_vec128 v ->
+        Format.fprintf ppf "%t#%a%t"
+          Flambda_colours.naked_number
+          Numeric_types.Vec128_by_bit_pattern.print v
+          Flambda_colours.pop
 
     let compare t1 t2 =
       match t1, t2 with
@@ -97,6 +103,8 @@ module Const_data = struct
       | Naked_int32 n1, Naked_int32 n2 -> Int32.compare n1 n2
       | Naked_int64 n1, Naked_int64 n2 -> Int64.compare n1 n2
       | Naked_nativeint n1, Naked_nativeint n2 -> Targetint_32_64.compare n1 n2
+      | Naked_vec128 v1, Naked_vec128 v2 ->
+        Numeric_types.Vec128_by_bit_pattern.compare v1 v2
       | Naked_immediate _, _ -> -1
       | _, Naked_immediate _ -> 1
       | Tagged_immediate _, _ -> -1
@@ -107,6 +115,8 @@ module Const_data = struct
       | _, Naked_int32 _ -> 1
       | Naked_int64 _, _ -> -1
       | _, Naked_int64 _ -> 1
+      | Naked_vec128 _, _ -> -1
+      | _, Naked_vec128 _ -> 1
 
     let equal t1 t2 =
       if t1 == t2
@@ -121,8 +131,11 @@ module Const_data = struct
         | Naked_int32 n1, Naked_int32 n2 -> Int32.equal n1 n2
         | Naked_int64 n1, Naked_int64 n2 -> Int64.equal n1 n2
         | Naked_nativeint n1, Naked_nativeint n2 -> Targetint_32_64.equal n1 n2
+        | Naked_vec128 v1, Naked_vec128 v2 ->
+          Numeric_types.Vec128_by_bit_pattern.equal v1 v2
         | ( ( Naked_immediate _ | Tagged_immediate _ | Naked_float _
-            | Naked_int32 _ | Naked_int64 _ | Naked_nativeint _ ),
+            | Naked_vec128 _ | Naked_int32 _ | Naked_int64 _ | Naked_nativeint _
+              ),
             _ ) ->
           false
 
@@ -134,6 +147,7 @@ module Const_data = struct
       | Naked_int32 n -> Hashtbl.hash n
       | Naked_int64 n -> Hashtbl.hash n
       | Naked_nativeint n -> Targetint_32_64.hash n
+      | Naked_vec128 v -> Numeric_types.Vec128_by_bit_pattern.hash v
   end)
 end
 
@@ -264,6 +278,8 @@ module Const = struct
 
   let naked_nativeint i = create (Naked_nativeint i)
 
+  let naked_vec128 i = create (Naked_vec128 i)
+
   let const_true = tagged_immediate Targetint_31_63.bool_true
 
   let const_false = tagged_immediate Targetint_31_63.bool_false
diff --git a/middle_end/flambda2/identifiers/int_ids.mli b/middle_end/flambda2/identifiers/int_ids.mli
index 5d8b3b374d2..b33d3123adc 100644
--- a/middle_end/flambda2/identifiers/int_ids.mli
+++ b/middle_end/flambda2/identifiers/int_ids.mli
@@ -60,6 +60,8 @@ module Const : sig
 
   val naked_nativeint : Targetint_32_64.t -> t
 
+  val naked_vec128 : Numeric_types.Vec128_by_bit_pattern.t -> t
+
   module Descr : sig
     type t = private
       | Naked_immediate of Targetint_31_63.t
@@ -68,6 +70,7 @@ module Const : sig
       | Naked_int32 of Int32.t
       | Naked_int64 of Int64.t
       | Naked_nativeint of Targetint_32_64.t
+      | Naked_vec128 of Numeric_types.Vec128_by_bit_pattern.t
 
     include Container_types.S with type t := t
   end
diff --git a/middle_end/flambda2/identifiers/reg_width_const.ml b/middle_end/flambda2/identifiers/reg_width_const.ml
index e61472864f9..5f4311314f1 100644
--- a/middle_end/flambda2/identifiers/reg_width_const.ml
+++ b/middle_end/flambda2/identifiers/reg_width_const.ml
@@ -24,3 +24,4 @@ let of_descr (descr : Descr.t) =
   | Naked_int32 i -> naked_int32 i
   | Naked_int64 i -> naked_int64 i
   | Naked_nativeint i -> naked_nativeint i
+  | Naked_vec128 v -> naked_vec128 v
diff --git a/middle_end/flambda2/kinds/flambda_kind.ml b/middle_end/flambda2/kinds/flambda_kind.ml
index a014ec6e7f8..0669bd8cb47 100644
--- a/middle_end/flambda2/kinds/flambda_kind.ml
+++ b/middle_end/flambda2/kinds/flambda_kind.ml
@@ -21,6 +21,7 @@ module Naked_number_kind = struct
     | Naked_int32
     | Naked_int64
     | Naked_nativeint
+    | Naked_vec128
 
   let print ppf t =
     match t with
@@ -29,6 +30,7 @@ module Naked_number_kind = struct
     | Naked_int32 -> Format.pp_print_string ppf "Naked_int32"
     | Naked_int64 -> Format.pp_print_string ppf "Naked_int64"
     | Naked_nativeint -> Format.pp_print_string ppf "Naked_nativeint"
+    | Naked_vec128 -> Format.pp_print_string ppf "Naked_vec128"
 end
 
 type t =
@@ -53,6 +55,8 @@ let naked_int64 = Naked_number Naked_int64
 
 let naked_nativeint = Naked_number Naked_nativeint
 
+let naked_vec128 = Naked_number Naked_vec128
+
 let region = Region
 
 let rec_info = Rec_info
@@ -66,6 +70,7 @@ let to_lambda (t : t) : Lambda.layout =
   | Naked_number Naked_int32 -> Punboxed_int Pint32
   | Naked_number Naked_int64 -> Punboxed_int Pint64
   | Naked_number Naked_nativeint -> Punboxed_int Pnativeint
+  | Naked_number Naked_vec128 -> Punboxed_vector Pvec128
   | Region -> Misc.fatal_error "Can't convert kind [Region] to lambda layout"
   | Rec_info ->
     Misc.fatal_error "Can't convert kind [Rec_info] to lambda layout"
@@ -105,6 +110,9 @@ include Container_types.Make (struct
         | Naked_nativeint ->
           Format.fprintf ppf "%t@<1>\u{2115}@<1>\u{2115}%t" colour
             Flambda_colours.pop
+        | Naked_vec128 ->
+          Format.fprintf ppf "%t@<1>\u{2115}@<1>\u{1d54d}128%t" colour
+            Flambda_colours.pop
       else
         Format.fprintf ppf "(Naked_number %a)" Naked_number_kind.print
           naked_number_kind
@@ -127,7 +135,9 @@ let is_naked_float t =
   match t with
   | Naked_number Naked_float -> true
   | Value
-  | Naked_number (Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint)
+  | Naked_number
+      ( Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint
+      | Naked_vec128 )
   | Region | Rec_info ->
     false
 
@@ -227,6 +237,7 @@ module Boxable_number = struct
     | Naked_int32
     | Naked_int64
     | Naked_nativeint
+    | Naked_vec128
 
   let unboxed_kind t : kind =
     match t with
@@ -234,10 +245,11 @@ module Boxable_number = struct
     | Naked_int32 -> Naked_number Naked_int32
     | Naked_int64 -> Naked_number Naked_int64
     | Naked_nativeint -> Naked_number Naked_nativeint
+    | Naked_vec128 -> Naked_number Naked_vec128
 
   let primitive_kind t : Primitive.boxed_integer =
     match t with
-    | Naked_float -> assert false
+    | Naked_vec128 | Naked_float -> assert false
     | Naked_int32 -> Pint32
     | Naked_int64 -> Pint64
     | Naked_nativeint -> Pnativeint
@@ -251,6 +263,7 @@ module Boxable_number = struct
       | Naked_int32 -> Format.pp_print_string ppf "Naked_int32"
       | Naked_int64 -> Format.pp_print_string ppf "Naked_int64"
       | Naked_nativeint -> Format.pp_print_string ppf "Naked_nativeint"
+      | Naked_vec128 -> Format.pp_print_string ppf "Naked_vec128"
 
     let compare = Stdlib.compare
 
@@ -265,6 +278,7 @@ module Boxable_number = struct
     | Naked_int32 -> Format.pp_print_string ppf "naked_int32"
     | Naked_int64 -> Format.pp_print_string ppf "naked_int64"
     | Naked_nativeint -> Format.pp_print_string ppf "naked_nativeint"
+    | Naked_vec128 -> Format.pp_print_string ppf "naked_vec128"
 
   let print_lowercase_short ppf t =
     match t with
@@ -272,6 +286,7 @@ module Boxable_number = struct
     | Naked_int32 -> Format.pp_print_string ppf "int32"
     | Naked_int64 -> Format.pp_print_string ppf "int64"
     | Naked_nativeint -> Format.pp_print_string ppf "nativeint"
+    | Naked_vec128 -> Format.pp_print_string ppf "vec128"
 end
 
 module With_subkind = struct
@@ -282,6 +297,7 @@ module With_subkind = struct
       | Boxed_int32
       | Boxed_int64
       | Boxed_nativeint
+      | Boxed_vec128
       | Tagged_immediate
       | Variant of
           { consts : Targetint_31_63.Set.t;
@@ -306,6 +322,7 @@ module With_subkind = struct
       | Boxed_int32, Boxed_int32
       | Boxed_int64, Boxed_int64
       | Boxed_nativeint, Boxed_nativeint
+      | Boxed_vec128, Boxed_vec128
       | Tagged_immediate, Tagged_immediate
       | Float_array, Float_array
       | Immediate_array, Immediate_array
@@ -353,8 +370,8 @@ module With_subkind = struct
         true
       (* All other combinations are incompatible: *)
       | ( ( Anything | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
-          | Tagged_immediate | Variant _ | Float_block _ | Float_array
-          | Immediate_array | Value_array | Generic_array ),
+          | Boxed_vec128 | Tagged_immediate | Variant _ | Float_block _
+          | Float_array | Immediate_array | Value_array | Generic_array ),
           _ ) ->
         false
 
@@ -380,6 +397,9 @@ module With_subkind = struct
         | Boxed_nativeint ->
           Format.fprintf ppf "%t=boxed_@<1>\u{2115}@<1>\u{2115}%t" colour
             Flambda_colours.pop
+        | Boxed_vec128 ->
+          Format.fprintf ppf "%t=boxed_@<1>\u{2115}@<1>\u{1d54d}128%t" colour
+            Flambda_colours.pop
         | Variant { consts; non_consts } ->
           let print_field ppf { kind = _; subkind } = print ppf subkind in
           Format.fprintf ppf "%t=Variant((consts (%a))@ (non_consts (%a)))%t"
@@ -420,7 +440,7 @@ module With_subkind = struct
     | Naked_number _ | Region | Rec_info -> (
       match subkind with
       | Anything -> ()
-      | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
+      | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint | Boxed_vec128
       | Tagged_immediate | Variant _ | Float_block _ | Float_array
       | Immediate_array | Value_array | Generic_array ->
         Misc.fatal_errorf "Subkind %a is not valid for kind %a" Subkind.print
@@ -449,6 +469,8 @@ module With_subkind = struct
 
   let naked_nativeint = create naked_nativeint Anything
 
+  let naked_vec128 = create naked_vec128 Anything
+
   let region = create region Anything
 
   let boxed_float = create value Boxed_float
@@ -459,6 +481,8 @@ module With_subkind = struct
 
   let boxed_nativeint = create value Boxed_nativeint
 
+  let boxed_vec128 = create value Boxed_vec128
+
   let tagged_immediate = create value Tagged_immediate
 
   let rec_info = create rec_info Anything
@@ -495,6 +519,7 @@ module With_subkind = struct
     | Naked_int32 -> naked_int32
     | Naked_int64 -> naked_int64
     | Naked_nativeint -> naked_nativeint
+    | Naked_vec128 -> naked_vec128
 
   let rec from_lambda_value_kind (vk : Lambda.value_kind) =
     match vk with
@@ -503,6 +528,7 @@ module With_subkind = struct
     | Pboxedintval Pint32 -> boxed_int32
     | Pboxedintval Pint64 -> boxed_int64
     | Pboxedintval Pnativeint -> boxed_nativeint
+    | Pboxedvectorval Pvec128 -> boxed_vec128
     | Pintval -> tagged_immediate
     | Pvariant { consts; non_consts } -> (
       match consts, non_consts with
@@ -544,6 +570,7 @@ module With_subkind = struct
     | Punboxed_int Pint32 -> naked_int32
     | Punboxed_int Pint64 -> naked_int64
     | Punboxed_int Pnativeint -> naked_nativeint
+    | Punboxed_vector Pvec128 -> naked_vec128
 
   include Container_types.Make (struct
     type nonrec t = t
@@ -555,8 +582,8 @@ module With_subkind = struct
         Format.fprintf ppf "@[%a%a@]" print kind Subkind.print subkind
       | ( (Naked_number _ | Region | Rec_info),
           ( Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
-          | Tagged_immediate | Variant _ | Float_block _ | Float_array
-          | Immediate_array | Value_array | Generic_array ) ) ->
+          | Boxed_vec128 | Tagged_immediate | Variant _ | Float_block _
+          | Float_array | Immediate_array | Value_array | Generic_array ) ) ->
         assert false
     (* see [create] *)
 
@@ -574,7 +601,7 @@ module With_subkind = struct
   let has_useful_subkind_info (t : t) =
     match t.subkind with
     | Anything -> false
-    | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
+    | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint | Boxed_vec128
     | Tagged_immediate | Variant _ | Float_block _ | Float_array
     | Immediate_array | Value_array | Generic_array ->
       true
diff --git a/middle_end/flambda2/kinds/flambda_kind.mli b/middle_end/flambda2/kinds/flambda_kind.mli
index 78658a4de9c..8b0451160ff 100644
--- a/middle_end/flambda2/kinds/flambda_kind.mli
+++ b/middle_end/flambda2/kinds/flambda_kind.mli
@@ -23,6 +23,7 @@ module Naked_number_kind : sig
     | Naked_int32
     | Naked_int64
     | Naked_nativeint
+    | Naked_vec128
 
   val print : Format.formatter -> t -> unit
 end
@@ -56,6 +57,8 @@ val naked_int64 : t
 
 val naked_nativeint : t
 
+val naked_vec128 : t
+
 val region : t
 
 val rec_info : t
@@ -112,6 +115,7 @@ module Boxable_number : sig
     | Naked_int32
     | Naked_int64
     | Naked_nativeint
+    | Naked_vec128
 
   val unboxed_kind : t -> kind
 
@@ -134,6 +138,7 @@ module With_subkind : sig
       | Boxed_int32
       | Boxed_int64
       | Boxed_nativeint
+      | Boxed_vec128
       | Tagged_immediate
       | Variant of
           { consts : Targetint_31_63.Set.t;
@@ -172,6 +177,8 @@ module With_subkind : sig
 
   val naked_nativeint : t
 
+  val naked_vec128 : t
+
   val region : t
 
   val boxed_float : t
@@ -182,6 +189,8 @@ module With_subkind : sig
 
   val boxed_nativeint : t
 
+  val boxed_vec128 : t
+
   val tagged_immediate : t
 
   val rec_info : t
diff --git a/middle_end/flambda2/numbers/numeric_types.ml b/middle_end/flambda2/numbers/numeric_types.ml
index 4894ebadf43..c6ef3377661 100644
--- a/middle_end/flambda2/numbers/numeric_types.ml
+++ b/middle_end/flambda2/numbers/numeric_types.ml
@@ -250,3 +250,65 @@ module Int64 = struct
 
   let cross_product = Pair.create_from_cross_product
 end
+
+module type Vector_width = sig
+  val size_in_int64s : int
+end
+
+module Vector_by_bit_pattern (Width : Vector_width) = struct
+  module T0 = struct
+    type t = Int64.t Array.t
+
+    let rec compare l r i =
+      if i = Width.size_in_int64s
+      then 0
+      else
+        let cmp = Int64.compare l.(i) r.(i) in
+        if cmp = 0 then compare l r (i + 1) else cmp
+
+    let compare l r = compare l r 0
+
+    let equal = Array.for_all2 Int64.equal
+
+    let hash v = Hashtbl.hash v
+
+    let print ppf t =
+      Format.pp_print_list
+        ~pp_sep:(fun ppf () -> Format.pp_print_char ppf ':')
+        (fun ppf i64 -> Format.fprintf ppf "%016Lx" i64)
+        ppf (Array.to_list t)
+  end
+
+  include T0
+  module Self = Container_types.Make (T0)
+  include Self
+
+  let zero = Array.init Width.size_in_int64s (fun _ -> 0L)
+
+  let to_int64_array t = t
+
+  let of_int64_array t =
+    if not (Array.length t = Width.size_in_int64s)
+    then
+      Misc.fatal_error
+        "Vector_by_bit_pattern.of_int64_array: wrong length array";
+    t
+end
+
+module Vec128_by_bit_pattern = struct
+  include Vector_by_bit_pattern (struct
+    let size_in_int64s = 2
+  end)
+
+  type bits =
+    { high : int64;
+      low : int64
+    }
+
+  let to_bits t =
+    match to_int64_array t with
+    | [| high; low |] -> { high; low }
+    | _ -> Misc.fatal_error "Vec128.to_bits: wrong size vector"
+
+  let of_bits { high; low } = of_int64_array [| high; low |]
+end
diff --git a/middle_end/flambda2/numbers/numeric_types.mli b/middle_end/flambda2/numbers/numeric_types.mli
index 7db0b4e9558..2ccbd662930 100644
--- a/middle_end/flambda2/numbers/numeric_types.mli
+++ b/middle_end/flambda2/numbers/numeric_types.mli
@@ -134,3 +134,21 @@ module Int64 : sig
 
   val cross_product : Set.t -> Set.t -> Pair.Set.t
 end
+
+module Vec128_by_bit_pattern : sig
+  (** 128-bit value whose comparison and equality relations are lexicographically
+      ordered by bit pattern. *)
+
+  include Container_types.S
+
+  val zero : t
+
+  type bits =
+    { high : int64;
+      low : int64
+    }
+
+  val to_bits : t -> bits
+
+  val of_bits : bits -> t
+end
diff --git a/middle_end/flambda2/parser/fexpr.ml b/middle_end/flambda2/parser/fexpr.ml
index 03395f4e2ef..1fd0b889764 100644
--- a/middle_end/flambda2/parser/fexpr.ml
+++ b/middle_end/flambda2/parser/fexpr.ml
@@ -57,6 +57,7 @@ type const =
   | Naked_float of float
   | Naked_int32 of int32
   | Naked_int64 of int64
+  | Naked_vec128 of Numeric_types.Vec128_by_bit_pattern.bits
   | Naked_nativeint of targetint
 
 type field_of_block =
@@ -89,6 +90,7 @@ type static_data =
   | Boxed_int32 of int32 or_variable
   | Boxed_int64 of int64 or_variable
   | Boxed_nativeint of targetint or_variable
+  | Boxed_vec128 of Numeric_types.Vec128_by_bit_pattern.bits or_variable
   | Immutable_float_block of float or_variable list
   | Immutable_float_array of float or_variable list
   | Immutable_value_array of field_of_block list
@@ -104,6 +106,7 @@ type subkind =
   | Boxed_int32
   | Boxed_int64
   | Boxed_nativeint
+  | Boxed_vec128
   | Tagged_immediate
   | Variant of
       { consts : targetint list;
@@ -178,6 +181,7 @@ type box_kind = Flambda_kind.Boxable_number.t =
   | Naked_int32
   | Naked_int64
   | Naked_nativeint
+  | Naked_vec128
 
 type generic_array_specialisation =
   | No_specialisation
diff --git a/middle_end/flambda2/parser/fexpr_to_flambda.ml b/middle_end/flambda2/parser/fexpr_to_flambda.ml
index cce7facc6dd..7d13c088030 100644
--- a/middle_end/flambda2/parser/fexpr_to_flambda.ml
+++ b/middle_end/flambda2/parser/fexpr_to_flambda.ml
@@ -238,6 +238,9 @@ let targetint (i : Fexpr.targetint) : Targetint_32_64.t =
 let targetint_31_63 (i : Fexpr.targetint) : Targetint_31_63.t =
   Targetint_31_63.of_int64 i
 
+let vec128 bits : Numeric_types.Vec128_by_bit_pattern.t =
+  Numeric_types.Vec128_by_bit_pattern.of_bits bits
+
 let tag_scannable (tag : Fexpr.tag_scannable) : Tag.Scannable.t =
   Tag.Scannable.create_exn tag
 
@@ -252,6 +255,7 @@ let rec subkind : Fexpr.subkind -> Flambda_kind.With_subkind.Subkind.t =
   | Boxed_int32 -> Boxed_int32
   | Boxed_int64 -> Boxed_int64
   | Boxed_nativeint -> Boxed_nativeint
+  | Boxed_vec128 -> Boxed_vec128
   | Tagged_immediate -> Tagged_immediate
   | Variant { consts; non_consts } ->
     let consts =
@@ -293,6 +297,7 @@ let const (c : Fexpr.const) : Reg_width_const.t =
   | Naked_int32 i -> Reg_width_const.naked_int32 i
   | Naked_int64 i -> Reg_width_const.naked_int64 i
   | Naked_nativeint i -> Reg_width_const.naked_nativeint (i |> targetint)
+  | Naked_vec128 bits -> Reg_width_const.naked_vec128 (bits |> vec128)
 
 let rec rec_info env (ri : Fexpr.rec_info) : Rec_info_expr.t =
   let module US = Rec_info_expr.Unrolling_state in
@@ -758,6 +763,8 @@ let rec expr env (e : Fexpr.expr) : Flambda.Expr.t =
           static_const (SC.boxed_int64 (or_variable Fun.id env i))
         | Boxed_nativeint i ->
           static_const (SC.boxed_nativeint (or_variable targetint env i))
+        | Boxed_vec128 i ->
+          static_const (SC.boxed_vec128 (or_variable vec128 env i))
         | Immutable_float_block elements ->
           static_const
             (SC.immutable_float_block
diff --git a/middle_end/flambda2/parser/flambda_parser.mli b/middle_end/flambda2/parser/flambda_parser.mli
index 97852cedadb..f1f876171ef 100644
--- a/middle_end/flambda2/parser/flambda_parser.mli
+++ b/middle_end/flambda2/parser/flambda_parser.mli
@@ -1,7 +1,7 @@
 
 (* The type of tokens. *)
 
-type token = 
+type token =
   | TILDEMINUS
   | TILDE
   | SYMBOL of (Fexpr.compilation_unit option * string)
@@ -185,20 +185,20 @@ val flambda_unit: (Lexing.lexbuf -> token) -> Lexing.lexbuf -> (Fexpr.flambda_un
 val expect_test_spec: (Lexing.lexbuf -> token) -> Lexing.lexbuf -> (Fexpr.expect_test_spec)
 
 module MenhirInterpreter : sig
-  
+
   (* The incremental API. *)
-  
+
   include CamlinternalMenhirLib.IncrementalEngine.INCREMENTAL_ENGINE
     with type token = token
-  
+
 end
 
 (* The entry point(s) to the incremental API. *)
 
 module Incremental : sig
-  
+
   val flambda_unit: Lexing.position -> (Fexpr.flambda_unit) MenhirInterpreter.checkpoint
-  
+
   val expect_test_spec: Lexing.position -> (Fexpr.expect_test_spec) MenhirInterpreter.checkpoint
-  
+
 end
diff --git a/middle_end/flambda2/parser/flambda_to_fexpr.ml b/middle_end/flambda2/parser/flambda_to_fexpr.ml
index 844ae908008..f1725cb35b7 100644
--- a/middle_end/flambda2/parser/flambda_to_fexpr.ml
+++ b/middle_end/flambda2/parser/flambda_to_fexpr.ml
@@ -354,6 +354,8 @@ let name env n =
 
 let float f = f |> Numeric_types.Float_by_bit_pattern.to_float
 
+let vec128 v = v |> Numeric_types.Vec128_by_bit_pattern.to_bits
+
 let targetint i = i |> Targetint_32_64.to_int64
 
 let const c : Fexpr.const =
@@ -367,6 +369,8 @@ let const c : Fexpr.const =
   | Naked_float f -> Naked_float (f |> float)
   | Naked_int32 i -> Naked_int32 i
   | Naked_int64 i -> Naked_int64 i
+  | Naked_vec128 i ->
+    Naked_vec128 (Numeric_types.Vec128_by_bit_pattern.to_bits i)
   | Naked_nativeint i -> Naked_nativeint (i |> targetint)
 
 let depth_or_infinity (d : int Or_infinity.t) : Fexpr.rec_info =
@@ -421,6 +425,7 @@ let rec subkind (k : Flambda_kind.With_subkind.Subkind.t) : Fexpr.subkind =
   | Boxed_int32 -> Boxed_int32
   | Boxed_int64 -> Boxed_int64
   | Boxed_nativeint -> Boxed_nativeint
+  | Boxed_vec128 -> Boxed_vec128
   | Tagged_immediate -> Tagged_immediate
   | Variant { consts; non_consts } -> variant_subkind consts non_consts
   | Float_array -> Float_array
@@ -689,6 +694,7 @@ let static_const env (sc : Static_const.t) : Fexpr.static_data =
   | Boxed_int32 i -> Boxed_int32 (or_variable Fun.id env i)
   | Boxed_int64 i -> Boxed_int64 (or_variable Fun.id env i)
   | Boxed_nativeint i -> Boxed_nativeint (or_variable targetint env i)
+  | Boxed_vec128 i -> Boxed_vec128 (or_variable vec128 env i)
   | Immutable_float_block elements ->
     Immutable_float_block (List.map (or_variable float env) elements)
   | Immutable_float_array elements ->
diff --git a/middle_end/flambda2/parser/print_fexpr.ml b/middle_end/flambda2/parser/print_fexpr.ml
index 316cbb68cdc..6371ccae309 100644
--- a/middle_end/flambda2/parser/print_fexpr.ml
+++ b/middle_end/flambda2/parser/print_fexpr.ml
@@ -132,6 +132,7 @@ let naked_number_kind ppf (nnk : Flambda_kind.Naked_number_kind.t) =
   | Naked_int32 -> "int32"
   | Naked_int64 -> "int64"
   | Naked_nativeint -> "nativeint"
+  | Naked_vec128 -> "vec128"
 
 let rec subkind ppf (k : subkind) =
   let str s = Format.pp_print_string ppf s in
@@ -142,6 +143,7 @@ let rec subkind ppf (k : subkind) =
   | Boxed_int32 -> str "int32 boxed"
   | Boxed_int64 -> str "int64 boxed"
   | Boxed_nativeint -> str "nativeint boxed"
+  | Boxed_vec128 -> str "vec128 boxed"
   | Variant { consts; non_consts } -> variant_subkind ppf consts non_consts
   | Tagged_immediate -> str "imm tagged"
   | Float_array -> str "float array"
@@ -255,6 +257,7 @@ let const ppf (c : Fexpr.const) =
   | Naked_int32 i -> Format.fprintf ppf "%lil" i
   | Naked_int64 i -> Format.fprintf ppf "%LiL" i
   | Naked_nativeint i -> Format.fprintf ppf "%Lin" i
+  | Naked_vec128 { high; low } -> Format.fprintf ppf "%016Lx:%016Lx" high low
 
 let rec simple ppf : simple -> unit = function
   | Symbol s -> symbol ppf s
@@ -319,10 +322,13 @@ let static_data ppf : static_data -> unit = function
   | Boxed_int32 (Const i) -> Format.fprintf ppf "%lil" i
   | Boxed_int64 (Const i) -> Format.fprintf ppf "%LiL" i
   | Boxed_nativeint (Const i) -> Format.fprintf ppf "%Lin" i
+  | Boxed_vec128 (Const { high; low }) ->
+    Format.fprintf ppf "%016Lx:%016Lx" high low
   | Boxed_float (Var v) -> boxed_variable ppf v ~kind:"float"
   | Boxed_int32 (Var v) -> boxed_variable ppf v ~kind:"int32"
   | Boxed_int64 (Var v) -> boxed_variable ppf v ~kind:"int64"
   | Boxed_nativeint (Var v) -> boxed_variable ppf v ~kind:"nativeint"
+  | Boxed_vec128 (Var v) -> boxed_variable ppf v ~kind:"vec128"
   | Immutable_float_block elements ->
     Format.fprintf ppf "Float_block (%a)"
       (pp_comma_list float_or_variable)
@@ -495,6 +501,7 @@ let unop ppf u =
     | Naked_int32 -> print verb_not_imm "int32"
     | Naked_int64 -> print verb_not_imm "int64"
     | Naked_nativeint -> print verb_not_imm "nativeint"
+    | Naked_vec128 -> print verb_not_imm "vec128"
   in
   match (u : unop) with
   | Array_length -> str "%array_length"
diff --git a/middle_end/flambda2/simplify/expr_builder.ml b/middle_end/flambda2/simplify/expr_builder.ml
index bd9e39ce827..354ea9cc71b 100644
--- a/middle_end/flambda2/simplify/expr_builder.ml
+++ b/middle_end/flambda2/simplify/expr_builder.ml
@@ -537,7 +537,7 @@ let create_let_symbols uacc lifted_constant ~body =
                 | Naked_number Naked_float -> Naked_floats { size = Unknown }
                 | Naked_number
                     ( Naked_immediate | Naked_nativeint | Naked_int32
-                    | Naked_int64 )
+                    | Naked_vec128 | Naked_int64 )
                 | Region | Rec_info ->
                   Misc.fatal_errorf
                     "Unexpected kind %a for symbol projection: %a"
diff --git a/middle_end/flambda2/simplify/lifting/reification.ml b/middle_end/flambda2/simplify/lifting/reification.ml
index 5cc00438b79..59b4ebb3a38 100644
--- a/middle_end/flambda2/simplify/lifting/reification.ml
+++ b/middle_end/flambda2/simplify/lifting/reification.ml
@@ -37,7 +37,7 @@ let create_static_const dacc dbg (to_lift : T.to_lift) : RSC.t =
             match Reg_width_const.descr const with
             | Tagged_immediate imm -> F.Tagged_immediate imm
             | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-            | Naked_nativeint _ ->
+            | Naked_vec128 _ | Naked_nativeint _ ->
               Misc.fatal_errorf
                 "Expected a constant of kind [Value] but got %a (dbg %a)"
                 Reg_width_const.print const Debuginfo.print_compact dbg))
@@ -54,6 +54,7 @@ let create_static_const dacc dbg (to_lift : T.to_lift) : RSC.t =
   | Boxed_int32 i -> RSC.create_boxed_int32 art (Const i)
   | Boxed_int64 i -> RSC.create_boxed_int64 art (Const i)
   | Boxed_nativeint i -> RSC.create_boxed_nativeint art (Const i)
+  | Boxed_vec128 v -> RSC.create_boxed_vec128 art (Const v)
   | Immutable_float_array { fields } ->
     let fields = List.map (fun f -> Or_variable.Const f) fields in
     RSC.create_immutable_float_array art fields
diff --git a/middle_end/flambda2/simplify/rebuilt_static_const.ml b/middle_end/flambda2/simplify/rebuilt_static_const.ml
index 3925d8bf9b0..9c7f7dc948b 100644
--- a/middle_end/flambda2/simplify/rebuilt_static_const.ml
+++ b/middle_end/flambda2/simplify/rebuilt_static_const.ml
@@ -126,6 +126,11 @@ let create_boxed_nativeint are_rebuilding or_var =
   then Block_not_rebuilt { free_names = Or_variable.free_names or_var }
   else create_normal_non_code (SC.boxed_nativeint or_var)
 
+let create_boxed_vec128 are_rebuilding or_var =
+  if ART.do_not_rebuild_terms are_rebuilding
+  then Block_not_rebuilt { free_names = Or_variable.free_names or_var }
+  else create_normal_non_code (SC.boxed_vec128 or_var)
+
 let create_immutable_float_block are_rebuilding fields =
   if ART.do_not_rebuild_terms are_rebuilding
   then
@@ -190,7 +195,7 @@ let map_set_of_closures t ~f =
                 (SC.set_of_closures set_of_closures);
             free_names = Set_of_closures.free_names set_of_closures
           }
-      | Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
+      | Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_vec128 _
       | Boxed_nativeint _ | Immutable_float_block _ | Immutable_float_array _
       | Immutable_value_array _ | Empty_array | Mutable_string _
       | Immutable_string _ ->
diff --git a/middle_end/flambda2/simplify/rebuilt_static_const.mli b/middle_end/flambda2/simplify/rebuilt_static_const.mli
index 03f487f5ebe..790f4796d87 100644
--- a/middle_end/flambda2/simplify/rebuilt_static_const.mli
+++ b/middle_end/flambda2/simplify/rebuilt_static_const.mli
@@ -59,6 +59,11 @@ val create_boxed_int64 : Are_rebuilding_terms.t -> Int64.t Or_variable.t -> t
 val create_boxed_nativeint :
   Are_rebuilding_terms.t -> Targetint_32_64.t Or_variable.t -> t
 
+val create_boxed_vec128 :
+  Are_rebuilding_terms.t ->
+  Numeric_types.Vec128_by_bit_pattern.t Or_variable.t ->
+  t
+
 val create_immutable_float_block :
   Are_rebuilding_terms.t ->
   Numeric_types.Float_by_bit_pattern.t Or_variable.t list ->
diff --git a/middle_end/flambda2/simplify/simplify_binary_primitive.ml b/middle_end/flambda2/simplify/simplify_binary_primitive.ml
index 10a95033e41..627fafea315 100644
--- a/middle_end/flambda2/simplify/simplify_binary_primitive.ml
+++ b/middle_end/flambda2/simplify/simplify_binary_primitive.ml
@@ -923,7 +923,7 @@ let simplify_immutable_block_load access_kind ~min_name_mode dacc ~original_term
             ~projection_bound_to:result_var
             ~kind:Flambda_kind.With_subkind.tagged_immediate
         | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ ->
+        | Naked_nativeint _ | Naked_vec128 _ ->
           Misc.fatal_errorf "Kind error for [Block_load] of %a at index %a"
             Simple.print arg1 Simple.print arg2)
       ~name:(fun _ ~coercion:_ -> dacc)
diff --git a/middle_end/flambda2/simplify/simplify_extcall.ml b/middle_end/flambda2/simplify/simplify_extcall.ml
index e8560098315..ba9f27d189f 100644
--- a/middle_end/flambda2/simplify/simplify_extcall.ml
+++ b/middle_end/flambda2/simplify/simplify_extcall.ml
@@ -111,7 +111,10 @@ let simplify_comparison ~dbg ~dacc ~cont ~tagged_prim ~float_prim
   | Proved (Boxed _), Proved Tagged_immediate
   | ( Proved
         (Boxed
-          (_, (Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint), _)),
+          ( _,
+            ( Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint
+            | Naked_vec128 ),
+            _ )),
       Proved (Boxed _) )
   (* One or two of the arguments is not known *)
   | Unknown, Unknown
diff --git a/middle_end/flambda2/simplify/simplify_static_const.ml b/middle_end/flambda2/simplify/simplify_static_const.ml
index 99afdb19e80..d84d4ff423c 100644
--- a/middle_end/flambda2/simplify/simplify_static_const.ml
+++ b/middle_end/flambda2/simplify/simplify_static_const.ml
@@ -37,7 +37,7 @@ let simplify_field_of_block dacc (field : Field_of_static_block.t) =
         match Reg_width_const.descr const with
         | Tagged_immediate imm -> Field_of_static_block.Tagged_immediate imm, ty
         | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ ->
+        | Naked_nativeint _ | Naked_vec128 _ ->
           (* CR mshinwell: This should be "invalid" and propagate up *)
           field, ty)
 
@@ -125,6 +125,17 @@ let simplify_static_const_of_kind_value dacc (static_const : Static_const.t)
         (DA.are_rebuilding_terms dacc)
         or_var,
       dacc )
+  | Boxed_vec128 or_var ->
+    let or_var, ty =
+      simplify_or_variable dacc
+        (fun f -> T.this_boxed_vec128 f Alloc_mode.For_types.heap)
+        or_var K.value
+    in
+    let dacc = bind_result_sym ty in
+    ( Rebuilt_static_const.create_boxed_vec128
+        (DA.are_rebuilding_terms dacc)
+        or_var,
+      dacc )
   | Immutable_float_block fields ->
     let fields_with_tys =
       List.map
diff --git a/middle_end/flambda2/simplify/simplify_switch_expr.ml b/middle_end/flambda2/simplify/simplify_switch_expr.ml
index 0ef1827bd8d..4e9e7a16716 100644
--- a/middle_end/flambda2/simplify/simplify_switch_expr.ml
+++ b/middle_end/flambda2/simplify/simplify_switch_expr.ml
@@ -180,7 +180,7 @@ let rebuild_arm uacc arm (action, use_id, arity, env_at_use)
               maybe_mergeable ~mergeable_arms ~identity_arms ~not_arms
             else maybe_mergeable ~mergeable_arms ~identity_arms ~not_arms
           | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-          | Naked_nativeint _ ->
+          | Naked_vec128 _ | Naked_nativeint _ ->
             maybe_mergeable ~mergeable_arms ~identity_arms ~not_arms
         in
         Simple.pattern_match arg ~const ~name:(fun _ ~coercion:_ ->
diff --git a/middle_end/flambda2/simplify/simplify_unary_primitive.ml b/middle_end/flambda2/simplify/simplify_unary_primitive.ml
index 765d0694079..3c07c9019b3 100644
--- a/middle_end/flambda2/simplify/simplify_unary_primitive.ml
+++ b/middle_end/flambda2/simplify/simplify_unary_primitive.ml
@@ -110,6 +110,10 @@ let simplify_unbox_number (boxable_number_kind : K.Boxable_number.t) dacc
       ( T.boxed_nativeint_alias_to ~naked_nativeint:result_var'
           (Alloc_mode.For_types.unknown ()),
         K.naked_nativeint )
+    | Naked_vec128 ->
+      ( T.boxed_vec128_alias_to ~naked_vec128:result_var'
+          (Alloc_mode.For_types.unknown ()),
+        K.naked_vec128 )
   in
   let alloc_mode =
     T.prove_alloc_mode_of_boxed_number (DA.typing_env dacc) boxed_number_ty
@@ -165,6 +169,7 @@ let simplify_box_number (boxable_number_kind : K.Boxable_number.t) alloc_mode
     | Naked_int32 -> T.box_int32 naked_number_ty alloc_mode
     | Naked_int64 -> T.box_int64 naked_number_ty alloc_mode
     | Naked_nativeint -> T.box_nativeint naked_number_ty alloc_mode
+    | Naked_vec128 -> T.box_vec128 naked_number_ty alloc_mode
   in
   let dacc = DA.add_variable dacc result_var ty in
   SPR.create original_term ~try_reify:true dacc
@@ -569,6 +574,7 @@ let simplify_obj_dup dbg dacc ~original_term ~arg ~arg_ty ~result_var =
       | Naked_int32 -> T.box_int32
       | Naked_int64 -> T.box_int64
       | Naked_nativeint -> T.box_nativeint
+      | Naked_vec128 -> T.box_vec128
     in
     let ty = boxer contents_ty Alloc_mode.For_types.heap in
     let dacc = DA.add_variable dacc result_var ty in
diff --git a/middle_end/flambda2/simplify/unboxing/build_unboxing_denv.ml b/middle_end/flambda2/simplify/unboxing/build_unboxing_denv.ml
index 343c2bed584..8ab1b7511fd 100644
--- a/middle_end/flambda2/simplify/unboxing/build_unboxing_denv.ml
+++ b/middle_end/flambda2/simplify/unboxing/build_unboxing_denv.ml
@@ -130,7 +130,8 @@ let rec denv_of_decision denv ~param_var (decision : U.decision) : DE.t =
               Unbox
                 ( Unique_tag_and_size _ | Variant _ | Closure_single_entry _
                 | Number
-                    ( (Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint),
+                    ( ( Naked_float | Naked_int32 | Naked_int64
+                      | Naked_nativeint | Naked_vec128 ),
                       _ ) );
             is_int = _
           } ->
@@ -195,3 +196,8 @@ let rec denv_of_decision denv ~param_var (decision : U.decision) : DE.t =
     in
     denv_of_number_decision K.naked_nativeint shape param_var naked_nativeint
       denv
+  | Unbox (Number (Naked_vec128, { param = naked_vec128; args = _ })) ->
+    let shape =
+      T.boxed_vec128_alias_to ~naked_vec128 (Alloc_mode.For_types.unknown ())
+    in
+    denv_of_number_decision K.naked_vec128 shape param_var naked_vec128 denv
diff --git a/middle_end/flambda2/simplify/unboxing/is_unboxing_beneficial.ml b/middle_end/flambda2/simplify/unboxing/is_unboxing_beneficial.ml
index c30062be732..0f1827e02c4 100644
--- a/middle_end/flambda2/simplify/unboxing/is_unboxing_beneficial.ml
+++ b/middle_end/flambda2/simplify/unboxing/is_unboxing_beneficial.ml
@@ -79,8 +79,9 @@ let rec filter_non_beneficial_decisions decision : U.decision =
     decision
   | Unbox
       (Number
-        ((Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint), epa)) as
-    decision ->
+        ( ( Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint
+          | Naked_vec128 ),
+          epa )) as decision ->
     if is_unboxing_beneficial_for_epa epa
     then decision
     else Do_not_unbox Not_beneficial
diff --git a/middle_end/flambda2/simplify/unboxing/optimistic_unboxing_decision.ml b/middle_end/flambda2/simplify/unboxing/optimistic_unboxing_decision.ml
index 11ecfe8e663..70f0e8a2a08 100644
--- a/middle_end/flambda2/simplify/unboxing/optimistic_unboxing_decision.ml
+++ b/middle_end/flambda2/simplify/unboxing/optimistic_unboxing_decision.ml
@@ -54,7 +54,8 @@ let deciders =
     Unboxers.Float.decider;
     Unboxers.Int32.decider;
     Unboxers.Int64.decider;
-    Unboxers.Nativeint.decider ]
+    Unboxers.Nativeint.decider;
+    Unboxers.Vec128.decider ]
 
 let rec make_optimistic_decision ~depth ~recursive tenv ~param_type : U.decision
     =
diff --git a/middle_end/flambda2/simplify/unboxing/unboxers.ml b/middle_end/flambda2/simplify/unboxing/unboxers.ml
index de4b628eec5..fddb5846084 100644
--- a/middle_end/flambda2/simplify/unboxing/unboxers.ml
+++ b/middle_end/flambda2/simplify/unboxing/unboxers.ml
@@ -123,6 +123,24 @@ module Nativeint = struct
     }
 end
 
+module Vec128 = struct
+  let decider =
+    { param_name = "unboxed_vec128";
+      kind = K.Naked_number_kind.Naked_vec128;
+      prove_is_a_boxed_number = T.prove_is_a_boxed_vec128
+    }
+
+  let unboxing_prim simple = P.(Unary (Unbox_number Naked_vec128, simple))
+
+  let unboxer =
+    { var_name = "unboxed_vec128";
+      invalid_const =
+        Const.naked_vec128 Numeric_types.Vec128_by_bit_pattern.zero;
+      unboxing_prim;
+      prove_simple = T.meet_boxed_vec128_containing_simple
+    }
+end
+
 module Field = struct
   let unboxing_prim bak ~block ~index =
     let field_const = Simple.const (Const.tagged_immediate index) in
diff --git a/middle_end/flambda2/simplify/unboxing/unboxers.mli b/middle_end/flambda2/simplify/unboxing/unboxers.mli
index bfa61f223c4..096a5530c0c 100644
--- a/middle_end/flambda2/simplify/unboxing/unboxers.mli
+++ b/middle_end/flambda2/simplify/unboxing/unboxers.mli
@@ -48,6 +48,8 @@ module Int64 : Number_S
 
 module Nativeint : Number_S
 
+module Vec128 : Number_S
+
 module Field : sig
   val unboxing_prim :
     P.Block_access_kind.t -> block:Simple.t -> index:Targetint_31_63.t -> P.t
diff --git a/middle_end/flambda2/simplify/unboxing/unboxing_epa.ml b/middle_end/flambda2/simplify/unboxing/unboxing_epa.ml
index ad9dbc3052f..cccf90e9ae9 100644
--- a/middle_end/flambda2/simplify/unboxing/unboxing_epa.ml
+++ b/middle_end/flambda2/simplify/unboxing/unboxing_epa.ml
@@ -159,8 +159,9 @@ let extra_args_for_const_ctor_of_variant
           Unbox
             ( Unique_tag_and_size _ | Variant _ | Closure_single_entry _
             | Number
-                ((Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint), _)
-              );
+                ( ( Naked_float | Naked_int32 | Naked_int64 | Naked_nativeint
+                  | Naked_vec128 ),
+                  _ ) );
         is_int = _
       } ->
     Misc.fatal_errorf
@@ -241,6 +242,9 @@ and compute_extra_args_for_one_decision_and_use_aux ~(pass : U.pass) rewrite_id
   | Unbox (Number (Naked_immediate, epa)) ->
     compute_extra_arg_for_number Naked_immediate Unboxers.Immediate.unboxer epa
       rewrite_id ~typing_env_at_use arg_being_unboxed
+  | Unbox (Number (Naked_vec128, epa)) ->
+    compute_extra_arg_for_number Naked_vec128 Unboxers.Vec128.unboxer epa
+      rewrite_id ~typing_env_at_use arg_being_unboxed
 
 and compute_extra_args_for_block ~pass rewrite_id ~typing_env_at_use
     arg_being_unboxed tag fields : U.decision =
@@ -460,7 +464,7 @@ let add_extra_params_and_args extra_params_and_args decision =
                 Unbox
                   ( Unique_tag_and_size _ | Variant _ | Closure_single_entry _
                   | Number
-                      ( ( Naked_float | Naked_int32 | Naked_int64
+                      ( ( Naked_float | Naked_int32 | Naked_int64 | Naked_vec128
                         | Naked_nativeint ),
                         _ ) );
               is_int = _
diff --git a/middle_end/flambda2/terms/code_size.ml b/middle_end/flambda2/terms/code_size.ml
index 4c3e8eb612a..4abf165f9d5 100644
--- a/middle_end/flambda2/terms/code_size.ml
+++ b/middle_end/flambda2/terms/code_size.ml
@@ -120,14 +120,14 @@ let arith_conversion_size src dst =
 
 let unbox_number kind =
   match (kind : Flambda_kind.Boxable_number.t) with
-  | Naked_float -> 1 (* 1 load *)
+  | Naked_float | Naked_vec128 -> 1 (* 1 load *)
   | Naked_int64 when arch32 -> 4 (* 2 Cadda + 2 loads *)
   | Naked_int32 | Naked_int64 | Naked_nativeint -> 2
 (* Cadda + load *)
 
 let box_number kind =
   match (kind : Flambda_kind.Boxable_number.t) with
-  | Naked_float -> alloc_size (* 1 alloc *)
+  | Naked_float | Naked_vec128 -> alloc_size (* 1 alloc *)
   | Naked_int32 when not arch32 -> 1 + alloc_size (* shift/sextend + alloc *)
   | Naked_int32 | Naked_int64 | Naked_nativeint -> alloc_size
 (* alloc *)
diff --git a/middle_end/flambda2/terms/flambda.ml b/middle_end/flambda2/terms/flambda.ml
index b64f66d73e2..f0920c58a0f 100644
--- a/middle_end/flambda2/terms/flambda.ml
+++ b/middle_end/flambda2/terms/flambda.ml
@@ -1379,6 +1379,9 @@ module Named = struct
         Simple.const (Reg_width_const.naked_int64 Int64.zero)
       | Naked_number Naked_nativeint ->
         Simple.const (Reg_width_const.naked_nativeint Targetint_32_64.zero)
+      | Naked_number Naked_vec128 ->
+        Simple.const
+          (Reg_width_const.naked_vec128 Numeric_types.Vec128_by_bit_pattern.zero)
       | Region -> Misc.fatal_error "[Region] kind not expected here"
       | Rec_info -> Misc.fatal_error "[Rec_info] kind not expected here"
     in
@@ -1410,7 +1413,7 @@ module Named = struct
              | Deleted_code
              | Static_const
                  ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-                 | Boxed_nativeint _ | Immutable_float_block _
+                 | Boxed_vec128 _ | Boxed_nativeint _ | Immutable_float_block _
                  | Immutable_float_array _ | Mutable_string _
                  | Immutable_string _ | Empty_array | Immutable_value_array _ )
                ->
diff --git a/middle_end/flambda2/terms/static_const.ml b/middle_end/flambda2/terms/static_const.ml
index 2a02b89f1b8..21be9fc8236 100644
--- a/middle_end/flambda2/terms/static_const.ml
+++ b/middle_end/flambda2/terms/static_const.ml
@@ -23,6 +23,7 @@ type t =
   | Boxed_int32 of Int32.t Or_variable.t
   | Boxed_int64 of Int64.t Or_variable.t
   | Boxed_nativeint of Targetint_32_64.t Or_variable.t
+  | Boxed_vec128 of Numeric_types.Vec128_by_bit_pattern.t Or_variable.t
   | Immutable_float_block of
       Numeric_types.Float_by_bit_pattern.t Or_variable.t list
   | Immutable_float_array of
@@ -44,6 +45,8 @@ let boxed_int64 or_var = Boxed_int64 or_var
 
 let boxed_nativeint or_var = Boxed_nativeint or_var
 
+let boxed_vec128 or_var = Boxed_vec128 or_var
+
 let immutable_float_block fields = Immutable_float_block fields
 
 let immutable_float_array fields =
@@ -96,6 +99,11 @@ let [@ocamlformat "disable"] print ppf t =
       Flambda_colours.static_part
       Flambda_colours.pop
       (Or_variable.print Targetint_32_64.print) or_var
+  | Boxed_vec128 or_var ->
+    fprintf ppf "@[<hov 1>(%tBoxed_vec128%t@ %a)@]"
+      Flambda_colours.static_part
+      Flambda_colours.pop
+      (Or_variable.print Numeric_types.Vec128_by_bit_pattern.print) or_var
   | Immutable_float_block fields ->
     fprintf ppf "@[<hov 1>(%tImmutable_float_block%t@ @[[| %a |]@])@]"
       Flambda_colours.static_part
@@ -161,6 +169,9 @@ include Container_types.Make (struct
       Or_variable.compare Numeric_types.Int64.compare or_var1 or_var2
     | Boxed_nativeint or_var1, Boxed_nativeint or_var2 ->
       Or_variable.compare Targetint_32_64.compare or_var1 or_var2
+    | Boxed_vec128 or_var1, Boxed_vec128 or_var2 ->
+      Or_variable.compare Numeric_types.Vec128_by_bit_pattern.compare or_var1
+        or_var2
     | Immutable_float_block fields1, Immutable_float_array fields2 ->
       Misc.Stdlib.List.compare
         (Or_variable.compare Numeric_types.Float_by_bit_pattern.compare)
@@ -188,6 +199,8 @@ include Container_types.Make (struct
     | _, Boxed_int64 _ -> 1
     | Boxed_nativeint _, _ -> -1
     | _, Boxed_nativeint _ -> 1
+    | Boxed_vec128 _, _ -> -1
+    | _, Boxed_vec128 _ -> 1
     | Immutable_float_block _, _ -> -1
     | _, Immutable_float_block _ -> 1
     | Immutable_float_array _, _ -> -1
@@ -218,6 +231,7 @@ let free_names t =
   | Boxed_int32 or_var -> Or_variable.free_names or_var
   | Boxed_int64 or_var -> Or_variable.free_names or_var
   | Boxed_nativeint or_var -> Or_variable.free_names or_var
+  | Boxed_vec128 or_var -> Or_variable.free_names or_var
   | Mutable_string { initial_value = _ } | Immutable_string _ | Empty_array ->
     Name_occurrences.empty
   | Immutable_float_block fields | Immutable_float_array fields ->
@@ -256,6 +270,9 @@ let apply_renaming t renaming =
     | Boxed_nativeint or_var ->
       let or_var' = Or_variable.apply_renaming or_var renaming in
       if or_var == or_var' then t else Boxed_nativeint or_var'
+    | Boxed_vec128 or_var ->
+      let or_var' = Or_variable.apply_renaming or_var renaming in
+      if or_var == or_var' then t else Boxed_vec128 or_var'
     | Mutable_string { initial_value = _ } | Immutable_string _ -> t
     | Immutable_float_block fields ->
       let fields' =
@@ -303,12 +320,14 @@ let ids_for_export t =
   | Boxed_float (Var (var, _dbg))
   | Boxed_int32 (Var (var, _dbg))
   | Boxed_int64 (Var (var, _dbg))
-  | Boxed_nativeint (Var (var, _dbg)) ->
+  | Boxed_nativeint (Var (var, _dbg))
+  | Boxed_vec128 (Var (var, _dbg)) ->
     Ids_for_export.add_variable Ids_for_export.empty var
   | Boxed_float (Const _)
   | Boxed_int32 (Const _)
   | Boxed_int64 (Const _)
   | Boxed_nativeint (Const _)
+  | Boxed_vec128 (Const _)
   | Mutable_string { initial_value = _ }
   | Immutable_string _ ->
     Ids_for_export.empty
@@ -332,8 +351,9 @@ let ids_for_export t =
 let is_block t =
   match t with
   | Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _
-  | Immutable_float_block _ | Immutable_float_array _ | Immutable_string _
-  | Mutable_string _ | Empty_array | Immutable_value_array _ ->
+  | Boxed_vec128 _ | Immutable_float_block _ | Immutable_float_array _
+  | Immutable_string _ | Mutable_string _ | Empty_array
+  | Immutable_value_array _ ->
     true
   | Set_of_closures _ -> false
 
@@ -341,8 +361,9 @@ let is_set_of_closures t =
   match t with
   | Set_of_closures _ -> true
   | Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _
-  | Immutable_float_block _ | Immutable_float_array _ | Immutable_string _
-  | Mutable_string _ | Empty_array | Immutable_value_array _ ->
+  | Boxed_vec128 _ | Immutable_float_block _ | Immutable_float_array _
+  | Immutable_string _ | Mutable_string _ | Empty_array
+  | Immutable_value_array _ ->
     false
 
 let is_fully_static t = free_names t |> Name_occurrences.no_variables
@@ -351,8 +372,9 @@ let can_share0 t =
   match t with
   | Block (_, Immutable, _)
   | Set_of_closures _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-  | Boxed_nativeint _ | Immutable_float_block _ | Immutable_float_array _
-  | Immutable_string _ | Empty_array | Immutable_value_array _ ->
+  | Boxed_vec128 _ | Boxed_nativeint _ | Immutable_float_block _
+  | Immutable_float_array _ | Immutable_string _ | Empty_array
+  | Immutable_value_array _ ->
     true
   | Block (_, (Mutable | Immutable_unique), _) | Mutable_string _ -> false
 
@@ -362,8 +384,9 @@ let must_be_set_of_closures t =
   match t with
   | Set_of_closures set -> set
   | Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _
-  | Immutable_float_block _ | Immutable_float_array _ | Empty_array
-  | Immutable_value_array _ | Immutable_string _ | Mutable_string _ ->
+  | Boxed_vec128 _ | Immutable_float_block _ | Immutable_float_array _
+  | Empty_array | Immutable_value_array _ | Immutable_string _
+  | Mutable_string _ ->
     Misc.fatal_errorf "Not a set of closures:@ %a" print t
 
 let match_against_bound_static_pattern t (pat : Bound_static.Pattern.t)
@@ -386,14 +409,14 @@ let match_against_bound_static_pattern t (pat : Bound_static.Pattern.t)
       Misc.fatal_errorf "Mismatch on declared function slots:@ %a@ =@ %a"
         Bound_static.Pattern.print pat print t;
     set_of_closures_callback ~closure_symbols set_of_closures
-  | ( ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
+  | ( ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_vec128 _
       | Boxed_nativeint _ | Immutable_float_block _ | Immutable_float_array _
       | Immutable_value_array _ | Empty_array | Immutable_string _
       | Mutable_string _ ),
       Block_like symbol ) ->
     block_like_callback symbol t
   | Set_of_closures _, (Block_like _ | Code _)
-  | ( ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
+  | ( ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_vec128 _
       | Boxed_nativeint _ | Immutable_float_block _ | Immutable_float_array _
       | Immutable_value_array _ | Empty_array | Immutable_string _
       | Mutable_string _ ),
diff --git a/middle_end/flambda2/terms/static_const.mli b/middle_end/flambda2/terms/static_const.mli
index 17950a648b3..89cf4bd78c1 100644
--- a/middle_end/flambda2/terms/static_const.mli
+++ b/middle_end/flambda2/terms/static_const.mli
@@ -29,6 +29,7 @@ type t = private
   | Boxed_int32 of Int32.t Or_variable.t
   | Boxed_int64 of Int64.t Or_variable.t
   | Boxed_nativeint of Targetint_32_64.t Or_variable.t
+  | Boxed_vec128 of Numeric_types.Vec128_by_bit_pattern.t Or_variable.t
   | Immutable_float_block of
       Numeric_types.Float_by_bit_pattern.t Or_variable.t list
   | Immutable_float_array of
@@ -62,6 +63,8 @@ val boxed_int64 : Int64.t Or_variable.t -> t
 
 val boxed_nativeint : Targetint_32_64.t Or_variable.t -> t
 
+val boxed_vec128 : Numeric_types.Vec128_by_bit_pattern.t Or_variable.t -> t
+
 val immutable_float_block :
   Numeric_types.Float_by_bit_pattern.t Or_variable.t list -> t
 
diff --git a/middle_end/flambda2/to_cmm/to_cmm_env.ml b/middle_end/flambda2/to_cmm/to_cmm_env.ml
index 36b67dbd886..4f774228e71 100644
--- a/middle_end/flambda2/to_cmm/to_cmm_env.ml
+++ b/middle_end/flambda2/to_cmm/to_cmm_env.ml
@@ -394,8 +394,8 @@ let splittable_primitive dbg prim args = Splittable_prim { dbg; prim; args }
 
 let is_cmm_simple cmm =
   match (cmm : Cmm.expression) with
-  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_symbol _ | Cvar _
-    ->
+  | Cconst_int _ | Cconst_natint _ | Cconst_float _ | Cconst_vec128 _
+  | Cconst_symbol _ | Cvar _ ->
     true
   | Clet _ | Clet_mut _ | Cphantom_let _ | Cassign _ | Ctuple _ | Cop _
   | Csequence _ | Cifthenelse _ | Cswitch _ | Ccatch _ | Cexit _ | Ctrywith _
diff --git a/middle_end/flambda2/to_cmm/to_cmm_expr.ml b/middle_end/flambda2/to_cmm/to_cmm_expr.ml
index fa5c7051fc0..6e2deec94e0 100644
--- a/middle_end/flambda2/to_cmm/to_cmm_expr.ml
+++ b/middle_end/flambda2/to_cmm/to_cmm_expr.ml
@@ -186,7 +186,8 @@ let translate_apply0 ~dbg_with_inlined:dbg env res apply =
         match Flambda_kind.With_subkind.kind kind with
         | Naked_number Naked_int32 -> C.sign_extend_32
         | Naked_number
-            (Naked_float | Naked_immediate | Naked_int64 | Naked_nativeint)
+            ( Naked_float | Naked_immediate | Naked_int64 | Naked_nativeint
+            | Naked_vec128 )
         | Value | Rec_info | Region ->
           fun _dbg cmm -> cmm)
       | _ ->
@@ -629,6 +630,7 @@ and let_cont_exn_handler env res k body vars handler free_vars_of_handler
           | Naked_number
               (Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint) ->
             C.int ~dbg 0
+          | Naked_number Naked_vec128 -> C.vec128 ~dbg { high = 0L; low = 0L }
           | Region | Rec_info ->
             Misc.fatal_errorf "No dummy value available for kind %a"
               K.With_subkind.print kind
diff --git a/middle_end/flambda2/to_cmm/to_cmm_primitive.ml b/middle_end/flambda2/to_cmm/to_cmm_primitive.ml
index 705e1f45616..165d166309a 100644
--- a/middle_end/flambda2/to_cmm/to_cmm_primitive.ml
+++ b/middle_end/flambda2/to_cmm/to_cmm_primitive.ml
@@ -50,6 +50,7 @@ let value_slot_offset env value_slot =
 let unbox_number ~dbg kind arg =
   match (kind : K.Boxable_number.t) with
   | Naked_float -> C.unbox_float dbg arg
+  | Naked_vec128 -> C.unbox_vector dbg Pvec128 arg
   | Naked_int32 | Naked_int64 | Naked_nativeint ->
     let primitive_kind = K.Boxable_number.primitive_kind kind in
     C.unbox_int dbg primitive_kind arg
@@ -58,6 +59,7 @@ let box_number ~dbg kind alloc_mode arg =
   let alloc_mode = Alloc_mode.For_allocations.to_lambda alloc_mode in
   match (kind : K.Boxable_number.t) with
   | Naked_float -> C.box_float dbg alloc_mode arg
+  | Naked_vec128 -> C.box_vector dbg Pvec128 alloc_mode arg
   | Naked_int32 | Naked_int64 | Naked_nativeint ->
     let primitive_kind = K.Boxable_number.primitive_kind kind in
     C.box_int_gen dbg primitive_kind alloc_mode arg
diff --git a/middle_end/flambda2/to_cmm/to_cmm_result.ml b/middle_end/flambda2/to_cmm/to_cmm_result.ml
index eac1d30a973..2347830a71e 100644
--- a/middle_end/flambda2/to_cmm/to_cmm_result.ml
+++ b/middle_end/flambda2/to_cmm/to_cmm_result.ml
@@ -95,7 +95,7 @@ let check_for_module_symbol t symbol =
 let defines_a_symbol data =
   match (data : Cmm.data_item) with
   | Cdefine_symbol _ -> true
-  | Cint8 _ | Cint16 _ | Cint32 _ | Cint _ | Csingle _ | Cdouble _
+  | Cint8 _ | Cint16 _ | Cint32 _ | Cint _ | Csingle _ | Cdouble _ | Cvec128 _
   | Csymbol_address _ | Cstring _ | Cskip _ | Calign _ ->
     false
 
diff --git a/middle_end/flambda2/to_cmm/to_cmm_shared.ml b/middle_end/flambda2/to_cmm/to_cmm_shared.ml
index 9b6a239d885..c2311f6cd4f 100644
--- a/middle_end/flambda2/to_cmm/to_cmm_shared.ml
+++ b/middle_end/flambda2/to_cmm/to_cmm_shared.ml
@@ -36,6 +36,7 @@ let exttype_of_kind (k : Flambda_kind.t) : Cmm.exttype =
     match Targetint_32_64.num_bits with
     | Thirty_two -> XInt32
     | Sixty_four -> XInt64)
+  | Naked_number Naked_vec128 -> XVec128
   | Region -> Misc.fatal_error "[Region] kind not expected here"
   | Rec_info -> Misc.fatal_error "[Rec_info] kind not expected here"
 
@@ -45,11 +46,12 @@ let machtype_of_kind (kind : Flambda_kind.With_subkind.t) =
     match Flambda_kind.With_subkind.subkind kind with
     | Tagged_immediate -> Cmm.typ_int
     | Anything | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
-    | Variant _ | Float_block _ | Float_array | Immediate_array | Value_array
-    | Generic_array ->
+    | Boxed_vec128 | Variant _ | Float_block _ | Float_array | Immediate_array
+    | Value_array | Generic_array ->
       Cmm.typ_val)
   | Naked_number Naked_float -> Cmm.typ_float
   | Naked_number Naked_int64 -> typ_int64
+  | Naked_number Naked_vec128 -> Cmm.typ_vec128
   | Naked_number (Naked_immediate | Naked_int32 | Naked_nativeint) ->
     Cmm.typ_int
   | Region | Rec_info -> assert false
@@ -60,11 +62,12 @@ let extended_machtype_of_kind (kind : Flambda_kind.With_subkind.t) =
     match Flambda_kind.With_subkind.subkind kind with
     | Tagged_immediate -> Extended_machtype.typ_tagged_int
     | Anything | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
-    | Variant _ | Float_block _ | Float_array | Immediate_array | Value_array
-    | Generic_array ->
+    | Boxed_vec128 | Variant _ | Float_block _ | Float_array | Immediate_array
+    | Value_array | Generic_array ->
       Extended_machtype.typ_val)
   | Naked_number Naked_float -> Extended_machtype.typ_float
   | Naked_number Naked_int64 -> Extended_machtype.typ_int64
+  | Naked_number Naked_vec128 -> Extended_machtype.typ_vec128
   | Naked_number (Naked_immediate | Naked_int32 | Naked_nativeint) ->
     Extended_machtype.typ_any_int
   | Region | Rec_info -> assert false
@@ -76,13 +79,14 @@ let memory_chunk_of_kind (kind : Flambda_kind.With_subkind.t) : Cmm.memory_chunk
     match Flambda_kind.With_subkind.subkind kind with
     | Tagged_immediate -> Word_int
     | Anything | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
-    | Variant _ | Float_block _ | Float_array | Immediate_array | Value_array
-    | Generic_array ->
+    | Boxed_vec128 | Variant _ | Float_block _ | Float_array | Immediate_array
+    | Value_array | Generic_array ->
       Word_val)
   | Naked_number (Naked_int32 | Naked_int64 | Naked_nativeint | Naked_immediate)
     ->
     Word_int
   | Naked_number Naked_float -> Double
+  | Naked_number Naked_vec128 -> Onetwentyeight
   | Region | Rec_info ->
     Misc.fatal_errorf "Bad kind %a for [memory_chunk_of_kind]"
       Flambda_kind.With_subkind.print kind
@@ -131,6 +135,11 @@ let const ~dbg cst =
   | Naked_float f -> float ~dbg (Numeric_types.Float_by_bit_pattern.to_float f)
   | Naked_int32 i -> int32 ~dbg i
   | Naked_int64 i -> int64 ~dbg i
+  | Naked_vec128 i ->
+    let { Numeric_types.Vec128_by_bit_pattern.high; low } =
+      Numeric_types.Vec128_by_bit_pattern.to_bits i
+    in
+    vec128 ~dbg { high; low }
   | Naked_nativeint t -> targetint ~dbg t
 
 let simple ?consider_inlining_effectful_expressions ~dbg env res s =
@@ -163,7 +172,14 @@ let const_static cst =
            (tag_targetint (Targetint_31_63.to_targetint i))) ]
   | Naked_float f -> [cfloat (Numeric_types.Float_by_bit_pattern.to_float f)]
   | Naked_int32 i -> [cint (Nativeint.of_int32 i)]
+  (* We don't compile flambda-backend in 32-bit mode, so nativeint is 64
+     bits. *)
   | Naked_int64 i -> [cint (Int64.to_nativeint i)]
+  | Naked_vec128 v ->
+    let { Numeric_types.Vec128_by_bit_pattern.high; low } =
+      Numeric_types.Vec128_by_bit_pattern.to_bits v
+    in
+    [cvec128 { high; low }]
   | Naked_nativeint t -> [cint (nativeint_of_targetint t)]
 
 let simple_static res s =
diff --git a/middle_end/flambda2/to_cmm/to_cmm_static.ml b/middle_end/flambda2/to_cmm/to_cmm_static.ml
index 468e87adf22..cfcc47862b7 100644
--- a/middle_end/flambda2/to_cmm/to_cmm_static.ml
+++ b/middle_end/flambda2/to_cmm/to_cmm_static.ml
@@ -158,6 +158,20 @@ let static_const0 env res ~updates (bound_static : Bound_static.Pattern.t)
         ~emit:C.emit_nativeint_constant ~transl ~structured v res updates
     in
     env, res, updates
+  | Block_like symbol, Boxed_vec128 v ->
+    let default = Numeric_types.Vec128_by_bit_pattern.zero in
+    let transl v =
+      let { Numeric_types.Vec128_by_bit_pattern.high; low } =
+        Numeric_types.Vec128_by_bit_pattern.to_bits v
+      in
+      { Cmm.high; low }
+    in
+    let structured { Cmm.high; low } = Clambda.Uconst_vec128 { high; low } in
+    let res, env, updates =
+      static_boxed_number ~kind:Onetwentyeight ~env ~symbol ~default
+        ~emit:C.emit_vec128_constant ~transl ~structured v res updates
+    in
+    env, res, updates
   | Block_like s, (Immutable_float_block fields | Immutable_float_array fields)
     ->
     let aux =
@@ -197,7 +211,7 @@ let static_const0 env res ~updates (bound_static : Bound_static.Pattern.t)
       "[Set_of_closures] values cannot be bound by [Block_like] bindings:@ %a"
       SC.print static_const
   | ( (Code _ | Set_of_closures _),
-      ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
+      ( Block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_vec128 _
       | Boxed_nativeint _ | Immutable_float_block _ | Immutable_float_array _
       | Immutable_value_array _ | Empty_array | Mutable_string _
       | Immutable_string _ ) ) ->
diff --git a/middle_end/flambda2/types/env/typing_env.ml b/middle_end/flambda2/types/env/typing_env.ml
index 5c7f86841d1..94c691d3032 100644
--- a/middle_end/flambda2/types/env/typing_env.ml
+++ b/middle_end/flambda2/types/env/typing_env.ml
@@ -1308,14 +1308,14 @@ end = struct
               match Reg_width_const.descr const with
               | Tagged_immediate i -> VA.Value_int i
               | Naked_immediate _ | Naked_float _ | Naked_int32 _
-              | Naked_int64 _ | Naked_nativeint _ ->
+              | Naked_vec128 _ | Naked_int64 _ | Naked_nativeint _ ->
                 VA.Value_unknown)
             ~var:(fun _ ~coercion:_ -> VA.Value_unknown)
             ~symbol:(fun symbol ~coercion:_ -> VA.Value_symbol symbol)
         | Ok (No_alias head) -> (
           match head with
           | Mutable_block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-          | Boxed_nativeint _ | String _ | Array _ ->
+          | Boxed_vec128 _ | Boxed_nativeint _ | String _ | Array _ ->
             Value_unknown
           | Closures { by_function_slot; alloc_mode = _ } -> (
             match TG.Row_like_for_closures.get_singleton by_function_slot with
@@ -1357,7 +1357,7 @@ end = struct
                 Block_approximation (Array.of_list fields, alloc_mode)
             else Value_unknown))
       | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-      | Naked_nativeint _ | Rec_info _ | Region _ ->
+      | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
         assert false
     in
     let symbol_ty, _binding_time_and_mode =
diff --git a/middle_end/flambda2/types/expand_head.ml b/middle_end/flambda2/types/expand_head.ml
index a7c8091f312..17ff71a9bfd 100644
--- a/middle_end/flambda2/types/expand_head.ml
+++ b/middle_end/flambda2/types/expand_head.ml
@@ -37,6 +37,8 @@ module Expanded_type : sig
 
   val create_naked_nativeint : Type_grammar.head_of_kind_naked_nativeint -> t
 
+  val create_naked_vec128 : Type_grammar.head_of_kind_naked_vec128 -> t
+
   val create_rec_info : Type_grammar.head_of_kind_rec_info -> t
 
   val create_region : Type_grammar.head_of_kind_region -> t
@@ -62,6 +64,7 @@ module Expanded_type : sig
     | Naked_int32 of Type_grammar.head_of_kind_naked_int32
     | Naked_int64 of Type_grammar.head_of_kind_naked_int64
     | Naked_nativeint of Type_grammar.head_of_kind_naked_nativeint
+    | Naked_vec128 of Type_grammar.head_of_kind_naked_vec128
     | Rec_info of Type_grammar.head_of_kind_rec_info
     | Region of Type_grammar.head_of_kind_region
 
@@ -79,6 +82,8 @@ module Expanded_type : sig
         Type_grammar.head_of_kind_naked_int64 Or_unknown_or_bottom.t
     | Naked_nativeint of
         Type_grammar.head_of_kind_naked_nativeint Or_unknown_or_bottom.t
+    | Naked_vec128 of
+        Type_grammar.head_of_kind_naked_vec128 Or_unknown_or_bottom.t
     | Rec_info of Type_grammar.head_of_kind_rec_info Or_unknown_or_bottom.t
     | Region of Type_grammar.head_of_kind_region Or_unknown_or_bottom.t
 
@@ -91,6 +96,7 @@ end = struct
     | Naked_int32 of TG.head_of_kind_naked_int32
     | Naked_int64 of TG.head_of_kind_naked_int64
     | Naked_nativeint of TG.head_of_kind_naked_nativeint
+    | Naked_vec128 of TG.head_of_kind_naked_vec128
     | Rec_info of TG.head_of_kind_rec_info
     | Region of TG.head_of_kind_region
 
@@ -118,6 +124,9 @@ end = struct
   let create_naked_nativeint head =
     { kind = K.naked_nativeint; descr = Ok (Naked_nativeint head) }
 
+  let create_naked_vec128 head =
+    { kind = K.naked_vec128; descr = Ok (Naked_vec128 head) }
+
   let create_rec_info head = { kind = K.rec_info; descr = Ok (Rec_info head) }
 
   let create_region head = { kind = K.region; descr = Ok (Region head) }
@@ -165,6 +174,15 @@ end = struct
         match TG.apply_coercion_head_of_kind_naked_float head coercion with
         | Bottom -> create_bottom K.naked_float
         | Ok head -> create_naked_float head))
+    | Naked_vec128 Unknown -> create_unknown K.naked_vec128
+    | Naked_vec128 Bottom -> create_bottom K.naked_vec128
+    | Naked_vec128 (Ok (No_alias head)) -> (
+      match coercion with
+      | None -> create_naked_vec128 head
+      | Some coercion -> (
+        match TG.apply_coercion_head_of_kind_naked_vec128 head coercion with
+        | Bottom -> create_bottom K.naked_vec128
+        | Ok head -> create_naked_vec128 head))
     | Naked_int32 Unknown -> create_unknown K.naked_int32
     | Naked_int32 Bottom -> create_bottom K.naked_int32
     | Naked_int32 (Ok (No_alias head)) -> (
@@ -213,6 +231,7 @@ end = struct
     | Value (Ok (Equals _))
     | Naked_immediate (Ok (Equals _))
     | Naked_float (Ok (Equals _))
+    | Naked_vec128 (Ok (Equals _))
     | Naked_int32 (Ok (Equals _))
     | Naked_int64 (Ok (Equals _))
     | Naked_nativeint (Ok (Equals _))
@@ -232,6 +251,7 @@ end = struct
       | Naked_int32 head -> TG.create_from_head_naked_int32 head
       | Naked_int64 head -> TG.create_from_head_naked_int64 head
       | Naked_nativeint head -> TG.create_from_head_naked_nativeint head
+      | Naked_vec128 head -> TG.create_from_head_naked_vec128 head
       | Rec_info head -> TG.create_from_head_rec_info head
       | Region head -> TG.create_from_head_region head)
 
@@ -247,6 +267,8 @@ end = struct
         Type_grammar.head_of_kind_naked_int64 Or_unknown_or_bottom.t
     | Naked_nativeint of
         Type_grammar.head_of_kind_naked_nativeint Or_unknown_or_bottom.t
+    | Naked_vec128 of
+        Type_grammar.head_of_kind_naked_vec128 Or_unknown_or_bottom.t
     | Rec_info of Type_grammar.head_of_kind_rec_info Or_unknown_or_bottom.t
     | Region of Type_grammar.head_of_kind_region Or_unknown_or_bottom.t
 
@@ -260,6 +282,7 @@ end = struct
       | Naked_number Naked_int32 -> Naked_int32 Unknown
       | Naked_number Naked_int64 -> Naked_int64 Unknown
       | Naked_number Naked_nativeint -> Naked_nativeint Unknown
+      | Naked_number Naked_vec128 -> Naked_vec128 Unknown
       | Rec_info -> Rec_info Unknown
       | Region -> Region Unknown)
     | Bottom -> (
@@ -270,6 +293,7 @@ end = struct
       | Naked_number Naked_int32 -> Naked_int32 Bottom
       | Naked_number Naked_int64 -> Naked_int64 Bottom
       | Naked_number Naked_nativeint -> Naked_nativeint Bottom
+      | Naked_number Naked_vec128 -> Naked_vec128 Bottom
       | Rec_info -> Rec_info Bottom
       | Region -> Region Bottom)
     | Ok (Value head) -> Value (Ok head)
@@ -278,6 +302,7 @@ end = struct
     | Ok (Naked_int32 head) -> Naked_int32 (Ok head)
     | Ok (Naked_int64 head) -> Naked_int64 (Ok head)
     | Ok (Naked_nativeint head) -> Naked_nativeint (Ok head)
+    | Ok (Naked_vec128 head) -> Naked_vec128 (Ok head)
     | Ok (Rec_info head) -> Rec_info (Ok head)
     | Ok (Region head) -> Region (Ok head)
 end
@@ -312,7 +337,9 @@ let expand_head_of_alias_type env kind
       | Naked_int64 i ->
         ET.create_naked_int64 (TG.Head_of_kind_naked_int64.create i)
       | Naked_nativeint i ->
-        ET.create_naked_nativeint (TG.Head_of_kind_naked_nativeint.create i))
+        ET.create_naked_nativeint (TG.Head_of_kind_naked_nativeint.create i)
+      | Naked_vec128 i ->
+        ET.create_naked_vec128 (TG.Head_of_kind_naked_vec128.create i))
     ~name
 
 let expand_head0 env ty ~known_canonical_simple_at_in_types_mode =
diff --git a/middle_end/flambda2/types/expand_head.mli b/middle_end/flambda2/types/expand_head.mli
index 7e88deb019e..ccc1714fe7b 100644
--- a/middle_end/flambda2/types/expand_head.mli
+++ b/middle_end/flambda2/types/expand_head.mli
@@ -32,6 +32,8 @@ module Expanded_type : sig
 
   val create_naked_nativeint : Type_grammar.head_of_kind_naked_nativeint -> t
 
+  val create_naked_vec128 : Type_grammar.head_of_kind_naked_vec128 -> t
+
   val create_rec_info : Type_grammar.head_of_kind_rec_info -> t
 
   val create_region : Type_grammar.head_of_kind_region -> t
@@ -57,6 +59,7 @@ module Expanded_type : sig
     | Naked_int32 of Type_grammar.head_of_kind_naked_int32
     | Naked_int64 of Type_grammar.head_of_kind_naked_int64
     | Naked_nativeint of Type_grammar.head_of_kind_naked_nativeint
+    | Naked_vec128 of Type_grammar.head_of_kind_naked_vec128
     | Rec_info of Type_grammar.head_of_kind_rec_info
     | Region of Type_grammar.head_of_kind_region
 
@@ -74,6 +77,8 @@ module Expanded_type : sig
         Type_grammar.head_of_kind_naked_int64 Or_unknown_or_bottom.t
     | Naked_nativeint of
         Type_grammar.head_of_kind_naked_nativeint Or_unknown_or_bottom.t
+    | Naked_vec128 of
+        Type_grammar.head_of_kind_naked_vec128 Or_unknown_or_bottom.t
     | Rec_info of Type_grammar.head_of_kind_rec_info Or_unknown_or_bottom.t
     | Region of Type_grammar.head_of_kind_region Or_unknown_or_bottom.t
 
diff --git a/middle_end/flambda2/types/flambda2_types.mli b/middle_end/flambda2/types/flambda2_types.mli
index 5301e1c546a..8406b81f3f9 100644
--- a/middle_end/flambda2/types/flambda2_types.mli
+++ b/middle_end/flambda2/types/flambda2_types.mli
@@ -365,6 +365,9 @@ val this_boxed_int64 : Numeric_types.Int64.t -> Alloc_mode.For_types.t -> t
 
 val this_boxed_nativeint : Targetint_32_64.t -> Alloc_mode.For_types.t -> t
 
+val this_boxed_vec128 :
+  Numeric_types.Vec128_by_bit_pattern.t -> Alloc_mode.For_types.t -> t
+
 val these_tagged_immediates : Targetint_31_63.Set.t -> t
 
 val these_boxed_floats :
@@ -412,6 +415,9 @@ val boxed_int64_alias_to : naked_int64:Variable.t -> Alloc_mode.For_types.t -> t
 val boxed_nativeint_alias_to :
   naked_nativeint:Variable.t -> Alloc_mode.For_types.t -> t
 
+val boxed_vec128_alias_to :
+  naked_vec128:Variable.t -> Alloc_mode.For_types.t -> t
+
 val box_float : t -> Alloc_mode.For_types.t -> t
 
 val box_int32 : t -> Alloc_mode.For_types.t -> t
@@ -420,6 +426,8 @@ val box_int64 : t -> Alloc_mode.For_types.t -> t
 
 val box_nativeint : t -> Alloc_mode.For_types.t -> t
 
+val box_vec128 : t -> Alloc_mode.For_types.t -> t
+
 val tagged_immediate_alias_to : naked_immediate:Variable.t -> t
 
 val tag_immediate : t -> t
@@ -599,6 +607,8 @@ val prove_is_a_boxed_int64 : Typing_env.t -> t -> unit proof_of_property
 
 val prove_is_a_boxed_nativeint : Typing_env.t -> t -> unit proof_of_property
 
+val prove_is_a_boxed_vec128 : Typing_env.t -> t -> unit proof_of_property
+
 val prove_is_or_is_not_a_boxed_float :
   Typing_env.t -> t -> bool proof_of_property
 
@@ -675,6 +685,9 @@ val meet_boxed_int64_containing_simple :
 val meet_boxed_nativeint_containing_simple :
   Typing_env.t -> min_name_mode:Name_mode.t -> t -> Simple.t meet_shortcut
 
+val meet_boxed_vec128_containing_simple :
+  Typing_env.t -> min_name_mode:Name_mode.t -> t -> Simple.t meet_shortcut
+
 val meet_block_field_simple :
   Typing_env.t ->
   min_name_mode:Name_mode.t ->
@@ -714,6 +727,7 @@ type to_lift = private
   | Boxed_int32 of Numeric_types.Int32.t
   | Boxed_int64 of Numeric_types.Int64.t
   | Boxed_nativeint of Targetint_32_64.t
+  | Boxed_vec128 of Numeric_types.Vec128_by_bit_pattern.t
   | Immutable_float_array of
       { fields : Numeric_types.Float_by_bit_pattern.t list }
   | Immutable_value_array of { fields : Simple.t list }
diff --git a/middle_end/flambda2/types/grammar/more_type_creators.ml b/middle_end/flambda2/types/grammar/more_type_creators.ml
index 298f030fa7b..f2f1ef746db 100644
--- a/middle_end/flambda2/types/grammar/more_type_creators.ml
+++ b/middle_end/flambda2/types/grammar/more_type_creators.ml
@@ -26,6 +26,7 @@ let unknown (kind : K.t) =
   | Naked_number Naked_int32 -> TG.any_naked_int32
   | Naked_number Naked_int64 -> TG.any_naked_int64
   | Naked_number Naked_nativeint -> TG.any_naked_nativeint
+  | Naked_number Naked_vec128 -> TG.any_naked_vec128
   | Rec_info -> TG.any_rec_info
   | Region -> TG.any_region
 
@@ -39,6 +40,7 @@ let bottom (kind : K.t) =
   | Naked_number Naked_int32 -> TG.bottom_naked_int32
   | Naked_number Naked_int64 -> TG.bottom_naked_int64
   | Naked_number Naked_nativeint -> TG.bottom_naked_nativeint
+  | Naked_number Naked_vec128 -> TG.bottom_naked_vec128
   | Rec_info -> TG.bottom_rec_info
   | Region -> TG.bottom_region
 
@@ -87,6 +89,9 @@ let this_boxed_int64 i alloc_mode =
 let this_boxed_nativeint i alloc_mode =
   TG.box_nativeint (TG.this_naked_nativeint i) alloc_mode
 
+let this_boxed_vec128 i alloc_mode =
+  TG.box_vec128 (TG.this_naked_vec128 i) alloc_mode
+
 let these_boxed_floats fs alloc_mode =
   TG.box_float (these_naked_floats fs) alloc_mode
 
@@ -111,6 +116,9 @@ let any_boxed_int64 =
 let any_boxed_nativeint =
   TG.box_nativeint TG.any_naked_nativeint (Alloc_mode.For_types.unknown ())
 
+let any_boxed_vec128 =
+  TG.box_vec128 TG.any_naked_vec128 (Alloc_mode.For_types.unknown ())
+
 let any_block =
   TG.create_variant ~is_unique:false
     ~immediates:(Known TG.bottom_naked_immediate) ~blocks:Unknown
@@ -255,6 +263,7 @@ let type_for_const const =
   | Naked_int32 n -> TG.this_naked_int32 n
   | Naked_int64 n -> TG.this_naked_int64 n
   | Naked_nativeint n -> TG.this_naked_nativeint n
+  | Naked_vec128 n -> TG.this_naked_vec128 n
 
 let kind_for_const const = TG.kind (type_for_const const)
 
@@ -290,12 +299,14 @@ let rec unknown_with_subkind ?(alloc_mode = Alloc_mode.For_types.unknown ())
     | Naked_number Naked_int32 -> TG.any_naked_int32
     | Naked_number Naked_int64 -> TG.any_naked_int64
     | Naked_number Naked_nativeint -> TG.any_naked_nativeint
+    | Naked_number Naked_vec128 -> TG.any_naked_vec128
     | Rec_info -> TG.any_rec_info
     | Region -> TG.any_region)
   | Boxed_float -> any_boxed_float
   | Boxed_int32 -> any_boxed_int32
   | Boxed_int64 -> any_boxed_int64
   | Boxed_nativeint -> any_boxed_nativeint
+  | Boxed_vec128 -> any_boxed_vec128
   | Tagged_immediate -> any_tagged_immediate
   | Variant { consts; non_consts } ->
     let const_ctors = these_naked_immediates consts in
diff --git a/middle_end/flambda2/types/grammar/more_type_creators.mli b/middle_end/flambda2/types/grammar/more_type_creators.mli
index 46e6df57046..988630953fc 100644
--- a/middle_end/flambda2/types/grammar/more_type_creators.mli
+++ b/middle_end/flambda2/types/grammar/more_type_creators.mli
@@ -58,6 +58,11 @@ val this_boxed_int64 : int64 -> Alloc_mode.For_types.t -> Type_grammar.t
 val this_boxed_nativeint :
   Targetint_32_64.t -> Alloc_mode.For_types.t -> Type_grammar.t
 
+val this_boxed_vec128 :
+  Numeric_types.Vec128_by_bit_pattern.t ->
+  Alloc_mode.For_types.t ->
+  Type_grammar.t
+
 val these_boxed_floats :
   Numeric_types.Float_by_bit_pattern.Set.t ->
   Alloc_mode.For_types.t ->
diff --git a/middle_end/flambda2/types/grammar/type_grammar.ml b/middle_end/flambda2/types/grammar/type_grammar.ml
index 99f80cf67be..2f88f31cf42 100644
--- a/middle_end/flambda2/types/grammar/type_grammar.ml
+++ b/middle_end/flambda2/types/grammar/type_grammar.ml
@@ -16,6 +16,7 @@
 
 module K = Flambda_kind
 module Float = Numeric_types.Float_by_bit_pattern
+module Vec128 = Numeric_types.Vec128_by_bit_pattern
 module Int32 = Numeric_types.Int32
 module Int64 = Numeric_types.Int64
 module RWC = Reg_width_const
@@ -44,6 +45,7 @@ type t =
   | Naked_int32 of head_of_kind_naked_int32 TD.t
   | Naked_int64 of head_of_kind_naked_int64 TD.t
   | Naked_nativeint of head_of_kind_naked_nativeint TD.t
+  | Naked_vec128 of head_of_kind_naked_vec128 TD.t
   | Rec_info of head_of_kind_rec_info TD.t
   | Region of head_of_kind_region TD.t
 
@@ -58,6 +60,7 @@ and head_of_kind_value =
   | Boxed_int32 of t * Alloc_mode.For_types.t
   | Boxed_int64 of t * Alloc_mode.For_types.t
   | Boxed_nativeint of t * Alloc_mode.For_types.t
+  | Boxed_vec128 of t * Alloc_mode.For_types.t
   | Closures of
       { by_function_slot : row_like_for_closures;
         alloc_mode : Alloc_mode.For_types.t
@@ -110,6 +113,8 @@ and head_of_kind_naked_int64 = Int64.Set.t
 
 and head_of_kind_naked_nativeint = Targetint_32_64.Set.t
 
+and head_of_kind_naked_vec128 = Vec128.Set.t
+
 and head_of_kind_rec_info = Rec_info_expr.t
 
 and head_of_kind_region = unit
@@ -219,6 +224,9 @@ let rec free_names0 ~follow_value_slots t =
   | Naked_nativeint ty ->
     type_descr_free_names
       ~free_names_head:free_names_head_of_kind_naked_nativeint ty
+  | Naked_vec128 ty ->
+    type_descr_free_names ~free_names_head:free_names_head_of_kind_naked_vec128
+      ty
   | Rec_info ty ->
     type_descr_free_names ~free_names_head:free_names_head_of_kind_rec_info ty
   | Region ty ->
@@ -237,6 +245,7 @@ and free_names_head_of_kind_value0 ~follow_value_slots head =
   | Boxed_int32 (ty, _alloc_mode) -> free_names0 ~follow_value_slots ty
   | Boxed_int64 (ty, _alloc_mode) -> free_names0 ~follow_value_slots ty
   | Boxed_nativeint (ty, _alloc_mode) -> free_names0 ~follow_value_slots ty
+  | Boxed_vec128 (ty, _alloc_mode) -> free_names0 ~follow_value_slots ty
   | Closures { by_function_slot; alloc_mode = _ } ->
     free_names_row_like_for_closures ~follow_value_slots by_function_slot
   | String _ -> Name_occurrences.empty
@@ -271,6 +280,8 @@ and free_names_head_of_kind_naked_int64 _ = Name_occurrences.empty
 
 and free_names_head_of_kind_naked_nativeint _ = Name_occurrences.empty
 
+and free_names_head_of_kind_naked_vec128 _ = Name_occurrences.empty
+
 and free_names_head_of_kind_rec_info head =
   Rec_info_expr.free_names_in_types head
 
@@ -451,6 +462,13 @@ let rec apply_renaming t renaming =
           ~free_names_head:free_names_head_of_kind_naked_nativeint ty renaming
       in
       if ty == ty' then t else Naked_nativeint ty'
+    | Naked_vec128 ty ->
+      let ty' =
+        TD.apply_renaming
+          ~apply_renaming_head:apply_renaming_head_of_kind_naked_vec128
+          ~free_names_head:free_names_head_of_kind_naked_vec128 ty renaming
+      in
+      if ty == ty' then t else Naked_vec128 ty'
     | Rec_info ty ->
       let ty' =
         TD.apply_renaming
@@ -493,6 +511,9 @@ and apply_renaming_head_of_kind_value head renaming =
   | Boxed_nativeint (ty, alloc_mode) ->
     let ty' = apply_renaming ty renaming in
     if ty == ty' then head else Boxed_nativeint (ty', alloc_mode)
+  | Boxed_vec128 (ty, alloc_mode) ->
+    let ty' = apply_renaming ty renaming in
+    if ty == ty' then head else Boxed_vec128 (ty', alloc_mode)
   | Closures { by_function_slot; alloc_mode } ->
     let by_function_slot' =
       apply_renaming_row_like_for_closures by_function_slot renaming
@@ -554,6 +575,8 @@ and apply_renaming_head_of_kind_naked_int64 head _ = head
 
 and apply_renaming_head_of_kind_naked_nativeint head _ = head
 
+and apply_renaming_head_of_kind_naked_vec128 head _ = head
+
 and apply_renaming_head_of_kind_rec_info head renaming =
   Rec_info_expr.apply_renaming head renaming
 
@@ -712,6 +735,10 @@ let rec print ppf t =
     Format.fprintf ppf "@[<hov 1>(Naked_nativeint@ %a)@]"
       (TD.print ~print_head:print_head_of_kind_naked_nativeint)
       ty
+  | Naked_vec128 ty ->
+    Format.fprintf ppf "@[<hov 1>(Naked_vec128@ %a)@]"
+      (TD.print ~print_head:print_head_of_kind_naked_vec128)
+      ty
   | Rec_info ty ->
     Format.fprintf ppf "@[<hov 1>(Rec_info@ %a)@]"
       (TD.print ~print_head:print_head_of_kind_rec_info)
@@ -747,6 +774,9 @@ and print_head_of_kind_value ppf head =
   | Boxed_nativeint (ty, alloc_mode) ->
     Format.fprintf ppf "@[<hov 1>(Boxed_nativeint@ %a@ %a)@]"
       Alloc_mode.For_types.print alloc_mode print ty
+  | Boxed_vec128 (ty, alloc_mode) ->
+    Format.fprintf ppf "@[<hov 1>(Boxed_vec128@ %a@ %a)@]"
+      Alloc_mode.For_types.print alloc_mode print ty
   | Closures { by_function_slot; alloc_mode } ->
     print_row_like_for_closures alloc_mode ppf by_function_slot
   | String str_infos ->
@@ -797,6 +827,9 @@ and print_head_of_kind_naked_nativeint ppf head =
   Format.fprintf ppf "@[(Naked_nativeint@ (%a))@]" Targetint_32_64.Set.print
     head
 
+and print_head_of_kind_naked_vec128 ppf head =
+  Format.fprintf ppf "@[(Naked_vec128@ (%a))@]" Vec128.Set.print head
+
 and print_head_of_kind_rec_info ppf head = Rec_info_expr.print ppf head
 
 and print_head_of_kind_region ppf () = Format.pp_print_string ppf "Region"
@@ -925,6 +958,9 @@ let rec ids_for_export t =
   | Naked_nativeint ty ->
     TD.ids_for_export
       ~ids_for_export_head:ids_for_export_head_of_kind_naked_nativeint ty
+  | Naked_vec128 ty ->
+    TD.ids_for_export
+      ~ids_for_export_head:ids_for_export_head_of_kind_naked_vec128 ty
   | Rec_info ty ->
     TD.ids_for_export ~ids_for_export_head:ids_for_export_head_of_kind_rec_info
       ty
@@ -942,6 +978,7 @@ and ids_for_export_head_of_kind_value head =
   | Boxed_int32 (t, _alloc_mode) -> ids_for_export t
   | Boxed_int64 (t, _alloc_mode) -> ids_for_export t
   | Boxed_nativeint (t, _alloc_mode) -> ids_for_export t
+  | Boxed_vec128 (t, _alloc_mode) -> ids_for_export t
   | Closures { by_function_slot; alloc_mode = _ } ->
     ids_for_export_row_like_for_closures by_function_slot
   | String _ -> Ids_for_export.empty
@@ -972,6 +1009,8 @@ and ids_for_export_head_of_kind_naked_int32 _ = Ids_for_export.empty
 
 and ids_for_export_head_of_kind_naked_int64 _ = Ids_for_export.empty
 
+and ids_for_export_head_of_kind_naked_vec128 _ = Ids_for_export.empty
+
 and ids_for_export_head_of_kind_naked_nativeint _ = Ids_for_export.empty
 
 and ids_for_export_head_of_kind_rec_info head =
@@ -1124,6 +1163,13 @@ let rec apply_coercion t coercion : t Or_bottom.t =
           coercion ty
       in
       if ty == ty' then t else Naked_nativeint ty'
+    | Naked_vec128 ty ->
+      let<+ ty' =
+        TD.apply_coercion
+          ~apply_coercion_head:apply_coercion_head_of_kind_naked_vec128 coercion
+          ty
+      in
+      if ty == ty' then t else Naked_vec128 ty'
     | Rec_info ty ->
       let<+ ty' =
         TD.apply_coercion
@@ -1157,7 +1203,8 @@ and apply_coercion_head_of_kind_value head coercion : _ Or_bottom.t =
        to have a [Boxed_float] wrapper that would lift a float coercion to a
        value coercion. *)
     if Coercion.is_id coercion then Ok head else Bottom
-  | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _ | String _ ->
+  | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _ | Boxed_vec128 _
+  | String _ ->
     (* Similarly, we don't have lifted coercions for these. *)
     if Coercion.is_id coercion then Ok head else Bottom
   | Array
@@ -1193,6 +1240,9 @@ and apply_coercion_head_of_kind_naked_int64 head coercion : _ Or_bottom.t =
 and apply_coercion_head_of_kind_naked_nativeint head coercion : _ Or_bottom.t =
   if Coercion.is_id coercion then Ok head else Bottom
 
+and apply_coercion_head_of_kind_naked_vec128 head coercion : _ Or_bottom.t =
+  if Coercion.is_id coercion then Ok head else Bottom
+
 and apply_coercion_head_of_kind_rec_info head coercion : _ Or_bottom.t =
   (* Currently no coercion has an effect on a depth variable and
      [Rec_info_expr.t] does not contain any other variety of name. *)
@@ -1440,6 +1490,14 @@ let rec remove_unused_value_slots_and_shortcut_aliases t ~used_value_slots
           remove_unused_value_slots_and_shortcut_aliases_head_of_kind_naked_nativeint
     in
     if ty == ty' then t else Naked_nativeint ty'
+  | Naked_vec128 ty ->
+    let ty' =
+      TD.remove_unused_value_slots_and_shortcut_aliases ty ~used_value_slots
+        ~canonicalise
+        ~remove_unused_value_slots_and_shortcut_aliases_head:
+          remove_unused_value_slots_and_shortcut_aliases_head_of_kind_naked_vec128
+    in
+    if ty == ty' then t else Naked_vec128 ty'
   | Rec_info ty ->
     let ty' =
       TD.remove_unused_value_slots_and_shortcut_aliases ty ~used_value_slots
@@ -1499,6 +1557,12 @@ and remove_unused_value_slots_and_shortcut_aliases_head_of_kind_value head
         ~canonicalise
     in
     if ty == ty' then head else Boxed_nativeint (ty', alloc_mode)
+  | Boxed_vec128 (ty, alloc_mode) ->
+    let ty' =
+      remove_unused_value_slots_and_shortcut_aliases ty ~used_value_slots
+        ~canonicalise
+    in
+    if ty == ty' then head else Boxed_vec128 (ty', alloc_mode)
   | Closures { by_function_slot; alloc_mode } ->
     let by_function_slot' =
       remove_unused_value_slots_and_shortcut_aliases_row_like_for_closures
@@ -1586,6 +1650,10 @@ and remove_unused_value_slots_and_shortcut_aliases_head_of_kind_naked_nativeint
     head ~used_value_slots:_ ~canonicalise:_ =
   head
 
+and remove_unused_value_slots_and_shortcut_aliases_head_of_kind_naked_vec128
+    head ~used_value_slots:_ ~canonicalise:_ =
+  head
+
 and remove_unused_value_slots_and_shortcut_aliases_head_of_kind_rec_info head
     ~used_value_slots:_ ~canonicalise:_ =
   head
@@ -1778,7 +1846,7 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Value ty -> ty
       | ( Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Value], got type %a"
           Variable.print var print ty
@@ -1795,7 +1863,7 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Naked_immediate ty -> ty
       | ( Value _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Naked_immediate], got \
            type %a"
@@ -1814,7 +1882,7 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Naked_float ty -> ty
       | ( Value _ | Naked_immediate _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Naked_float], got type %a"
           Variable.print var print ty
@@ -1832,7 +1900,7 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Naked_int32 ty -> ty
       | ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int64 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Naked_int32], got type %a"
           Variable.print var print ty
@@ -1850,7 +1918,7 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Naked_int64 ty -> ty
       | ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ) as ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Naked_int64], got type %a"
           Variable.print var print ty
@@ -1868,7 +1936,7 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Naked_nativeint ty -> ty
       | ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
-        | Naked_int64 _ | Rec_info _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_int64 _ | Rec_info _ | Region _ ) as ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Naked_nativeint], got \
            type %a"
@@ -1882,12 +1950,31 @@ let rec project_variables_out ~to_project ~expand t =
         ty
     in
     if ty == ty' then t else Naked_nativeint ty'
+  | Naked_vec128 ty ->
+    let expand_with_coercion var ~coercion =
+      match apply_coercion (expand var) coercion with
+      | Naked_vec128 ty -> ty
+      | ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
+        | Naked_nativeint _ | Naked_int64 _ | Rec_info _ | Region _ ) as ty ->
+        Misc.fatal_errorf
+          "Wrong kind while expanding %a: expecting [Naked_vec128], got type %a"
+          Variable.print var print ty
+    in
+    let ty' =
+      TD.project_variables_out
+        ~free_names_head:free_names_head_of_kind_naked_vec128 ~to_project
+        ~expand:expand_with_coercion
+        ~project_head:(project_head_of_kind_naked_vec128 ~to_project ~expand)
+        ty
+    in
+    if ty == ty' then t else Naked_vec128 ty'
   | Rec_info ty ->
     let expand_with_coercion var ~coercion =
       match apply_coercion (expand var) coercion with
       | Rec_info ty -> ty
       | ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
-        | Naked_int64 _ | Naked_nativeint _ | Region _ ) as ty ->
+        | Naked_vec128 _ | Naked_int64 _ | Naked_nativeint _ | Region _ ) as ty
+        ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Rec_info], got type %a"
           Variable.print var print ty
@@ -1904,7 +1991,8 @@ let rec project_variables_out ~to_project ~expand t =
       match apply_coercion (expand var) coercion with
       | Region ty -> ty
       | ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
-        | Naked_int64 _ | Naked_nativeint _ | Rec_info _ ) as ty ->
+        | Naked_vec128 _ | Naked_int64 _ | Naked_nativeint _ | Rec_info _ ) as
+        ty ->
         Misc.fatal_errorf
           "Wrong kind while expanding %a: expecting [Region], got type %a"
           Variable.print var print ty
@@ -1944,6 +2032,9 @@ and project_head_of_kind_value ~to_project ~expand head =
   | Boxed_nativeint (ty, alloc_mode) ->
     let ty' = project_variables_out ~to_project ~expand ty in
     if ty == ty' then head else Boxed_nativeint (ty', alloc_mode)
+  | Boxed_vec128 (ty, alloc_mode) ->
+    let ty' = project_variables_out ~to_project ~expand ty in
+    if ty == ty' then head else Boxed_vec128 (ty', alloc_mode)
   | Closures { by_function_slot; alloc_mode } ->
     let by_function_slot' =
       project_row_like_for_closures ~to_project ~expand by_function_slot
@@ -2005,6 +2096,8 @@ and project_head_of_kind_naked_int64 ~to_project:_ ~expand:_ head = head
 
 and project_head_of_kind_naked_nativeint ~to_project:_ ~expand:_ head = head
 
+and project_head_of_kind_naked_vec128 ~to_project:_ ~expand:_ head = head
+
 and project_head_of_kind_rec_info ~to_project ~expand:_ head =
   match (head : head_of_kind_rec_info) with
   | Const _ | Succ _ | Unroll_to _ -> head
@@ -2176,6 +2269,7 @@ let kind t =
   | Naked_int32 _ -> K.naked_int32
   | Naked_int64 _ -> K.naked_int64
   | Naked_nativeint _ -> K.naked_nativeint
+  | Naked_vec128 _ -> K.naked_vec128
   | Rec_info _ -> K.rec_info
   | Region _ -> K.region
 
@@ -2383,6 +2477,7 @@ module Row_like_for_blocks = struct
         | Naked_number Naked_int32
         | Naked_number Naked_int64
         | Naked_number Naked_nativeint
+        | Naked_number Naked_vec128
         | Region | Rec_info ->
           Misc.fatal_errorf "Bad kind %a for fields" Flambda_kind.print
             field_kind)
@@ -2404,6 +2499,7 @@ module Row_like_for_blocks = struct
         | Naked_number Naked_int32
         | Naked_number Naked_int64
         | Naked_number Naked_nativeint
+        | Naked_number Naked_vec128
         | Region | Rec_info ->
           Misc.fatal_errorf "Bad kind %a for fields" Flambda_kind.print
             field_kind)
@@ -2628,6 +2724,7 @@ let get_alias_exn t =
   | Naked_int32 ty -> TD.get_alias_exn ty
   | Naked_int64 ty -> TD.get_alias_exn ty
   | Naked_nativeint ty -> TD.get_alias_exn ty
+  | Naked_vec128 ty -> TD.get_alias_exn ty
   | Rec_info ty -> TD.get_alias_exn ty
   | Region ty -> TD.get_alias_exn ty
 
@@ -2642,6 +2739,7 @@ let is_obviously_bottom t =
   | Naked_int32 ty -> TD.is_obviously_bottom ty
   | Naked_int64 ty -> TD.is_obviously_bottom ty
   | Naked_nativeint ty -> TD.is_obviously_bottom ty
+  | Naked_vec128 ty -> TD.is_obviously_bottom ty
   | Rec_info ty -> TD.is_obviously_bottom ty
   | Region ty -> TD.is_obviously_bottom ty
 
@@ -2653,6 +2751,7 @@ let is_obviously_unknown t =
   | Naked_int32 ty -> TD.is_obviously_unknown ty
   | Naked_int64 ty -> TD.is_obviously_unknown ty
   | Naked_nativeint ty -> TD.is_obviously_unknown ty
+  | Naked_vec128 ty -> TD.is_obviously_unknown ty
   | Rec_info ty -> TD.is_obviously_unknown ty
   | Region ty -> TD.is_obviously_unknown ty
 
@@ -2664,6 +2763,7 @@ let alias_type_of (kind : K.t) name : t =
   | Naked_number Naked_int32 -> Naked_int32 (TD.create_equals name)
   | Naked_number Naked_int64 -> Naked_int64 (TD.create_equals name)
   | Naked_number Naked_nativeint -> Naked_nativeint (TD.create_equals name)
+  | Naked_number Naked_vec128 -> Naked_vec128 (TD.create_equals name)
   | Rec_info -> Rec_info (TD.create_equals name)
   | Region -> Region (TD.create_equals name)
 
@@ -2679,6 +2779,8 @@ let bottom_naked_int64 = Naked_int64 TD.bottom
 
 let bottom_naked_nativeint = Naked_nativeint TD.bottom
 
+let bottom_naked_vec128 = Naked_vec128 TD.bottom
+
 let bottom_rec_info = Rec_info TD.bottom
 
 let bottom_region = Region TD.bottom
@@ -2695,6 +2797,8 @@ let any_naked_int64 = Naked_int64 TD.unknown
 
 let any_naked_nativeint = Naked_nativeint TD.unknown
 
+let any_naked_vec128 = Naked_vec128 TD.unknown
+
 let any_region = Region TD.unknown
 
 let any_rec_info = Rec_info TD.unknown
@@ -2714,6 +2818,9 @@ let this_naked_int64 i : t =
 let this_naked_nativeint i : t =
   Naked_nativeint (TD.create_equals (Simple.const (RWC.naked_nativeint i)))
 
+let this_naked_vec128 i : t =
+  Naked_vec128 (TD.create_equals (Simple.const (RWC.naked_vec128 i)))
+
 let these_naked_immediates is =
   match Targetint_31_63.Set.get_singleton is with
   | Some i -> this_naked_immediate i
@@ -2757,21 +2864,21 @@ let these_naked_nativeints is =
 let box_float (t : t) alloc_mode : t =
   match t with
   | Naked_float _ -> Value (TD.create (Boxed_float (t, alloc_mode)))
-  | Value _ | Naked_immediate _ | Naked_int32 _ | Naked_int64 _
+  | Value _ | Naked_immediate _ | Naked_int32 _ | Naked_int64 _ | Naked_vec128 _
   | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Type of wrong kind for [box_float]: %a" print t
 
 let box_int32 (t : t) alloc_mode : t =
   match t with
   | Naked_int32 _ -> Value (TD.create (Boxed_int32 (t, alloc_mode)))
-  | Value _ | Naked_immediate _ | Naked_float _ | Naked_int64 _
+  | Value _ | Naked_immediate _ | Naked_float _ | Naked_int64 _ | Naked_vec128 _
   | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Type of wrong kind for [box_int32]: %a" print t
 
 let box_int64 (t : t) alloc_mode : t =
   match t with
   | Naked_int64 _ -> Value (TD.create (Boxed_int64 (t, alloc_mode)))
-  | Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
+  | Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_vec128 _
   | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Type of wrong kind for [box_int64]: %a" print t
 
@@ -2779,9 +2886,16 @@ let box_nativeint (t : t) alloc_mode : t =
   match t with
   | Naked_nativeint _ -> Value (TD.create (Boxed_nativeint (t, alloc_mode)))
   | Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Type of wrong kind for [box_nativeint]: %a" print t
 
+let box_vec128 (t : t) alloc_mode : t =
+  match t with
+  | Naked_vec128 _ -> Value (TD.create (Boxed_vec128 (t, alloc_mode)))
+  | Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
+  | Naked_nativeint _ | Rec_info _ | Region _ ->
+    Misc.fatal_errorf "Type of wrong kind for [box_vec128]: %a" print t
+
 let this_tagged_immediate imm : t =
   Value (TD.create_equals (Simple.const (RWC.tagged_immediate imm)))
 
@@ -2796,7 +2910,7 @@ let tag_immediate t : t =
               blocks = Known Row_like_for_blocks.bottom
             }))
   | Value _ | Naked_float _ | Naked_int32 _ | Naked_int64 _ | Naked_nativeint _
-  | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Type of wrong kind for [tag_immediate]: %a" print t
 
 let tagged_immediate_alias_to ~naked_immediate : t =
@@ -2822,6 +2936,9 @@ let boxed_nativeint_alias_to ~naked_nativeint =
   box_nativeint
     (Naked_nativeint (TD.create_equals (Simple.var naked_nativeint)))
 
+let boxed_vec128_alias_to ~naked_vec128 =
+  box_vec128 (Naked_vec128 (TD.create_equals (Simple.var naked_vec128)))
+
 let this_immutable_string str =
   let size = Targetint_31_63.of_int (String.length str) in
   let string_info =
@@ -2874,6 +2991,8 @@ module Descr = struct
     | Naked_int64 of head_of_kind_naked_int64 TD.Descr.t Or_unknown_or_bottom.t
     | Naked_nativeint of
         head_of_kind_naked_nativeint TD.Descr.t Or_unknown_or_bottom.t
+    | Naked_vec128 of
+        head_of_kind_naked_vec128 TD.Descr.t Or_unknown_or_bottom.t
     | Rec_info of head_of_kind_rec_info TD.Descr.t Or_unknown_or_bottom.t
     | Region of head_of_kind_region TD.Descr.t Or_unknown_or_bottom.t
 end
@@ -2886,6 +3005,7 @@ let descr t : Descr.t =
   | Naked_int32 ty -> Naked_int32 (TD.descr ty)
   | Naked_int64 ty -> Naked_int64 (TD.descr ty)
   | Naked_nativeint ty -> Naked_nativeint (TD.descr ty)
+  | Naked_vec128 ty -> Naked_vec128 (TD.descr ty)
   | Rec_info ty -> Rec_info (TD.descr ty)
   | Region ty -> Region (TD.descr ty)
 
@@ -2901,6 +3021,8 @@ let create_from_head_naked_int64 head = Naked_int64 (TD.create head)
 
 let create_from_head_naked_nativeint head = Naked_nativeint (TD.create head)
 
+let create_from_head_naked_vec128 head = Naked_vec128 (TD.create head)
+
 let create_from_head_rec_info head = Rec_info (TD.create head)
 
 let create_from_head_region head = Region (TD.create head)
@@ -2921,6 +3043,8 @@ module Head_of_kind_value = struct
 
   let create_boxed_nativeint ty alloc_mode = Boxed_nativeint (ty, alloc_mode)
 
+  let create_boxed_vec128 ty alloc_mode = Boxed_vec128 (ty, alloc_mode)
+
   let create_tagged_immediate imm : t =
     Variant
       { is_unique = false;
@@ -2993,6 +3117,8 @@ module Head_of_kind_naked_int32 = Make_head_of_kind_naked_number (Int32)
 module Head_of_kind_naked_int64 = Make_head_of_kind_naked_number (Int64)
 module Head_of_kind_naked_nativeint =
   Make_head_of_kind_naked_number (Targetint_32_64)
+module Head_of_kind_naked_vec128 =
+  Make_head_of_kind_naked_number (Numeric_types.Vec128_by_bit_pattern)
 
 let rec recover_some_aliases t =
   match t with
@@ -3003,7 +3129,8 @@ let rec recover_some_aliases t =
     | Ok
         (No_alias
           ( Mutable_block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-          | Boxed_nativeint _ | String _ | Closures _ | Array _ )) ->
+          | Boxed_vec128 _ | Boxed_nativeint _ | String _ | Closures _ | Array _
+            )) ->
       t
     | Ok (No_alias (Variant { immediates; blocks; is_unique = _ })) -> (
       match blocks with
@@ -3027,13 +3154,13 @@ let rec recover_some_aliases t =
                     match Reg_width_const.descr const with
                     | Naked_immediate i -> this_tagged_immediate i
                     | Tagged_immediate _ | Naked_float _ | Naked_int32 _
-                    | Naked_int64 _ | Naked_nativeint _ ->
+                    | Naked_int64 _ | Naked_nativeint _ | Naked_vec128 _ ->
                       Misc.fatal_errorf
                         "Immediates case returned wrong kind of constant:@ %a"
                         Reg_width_const.print const)
               | Unknown | Bottom | Ok (No_alias _) -> t)
             | Value _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-            | Naked_nativeint _ | Rec_info _ | Region _ ->
+            | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
               Misc.fatal_errorf "Immediates case returned wrong kind:@ %a" print
                 t' ()))))
   | Naked_immediate ty -> (
@@ -3072,4 +3199,11 @@ let rec recover_some_aliases t =
       match Targetint_32_64.Set.get_singleton is with
       | Some f -> this_naked_nativeint f
       | None -> t))
+  | Naked_vec128 ty -> (
+    match TD.descr ty with
+    | Unknown | Bottom | Ok (Equals _) -> t
+    | Ok (No_alias is) -> (
+      match Vec128.Set.get_singleton is with
+      | Some f -> this_naked_vec128 f
+      | None -> t))
   | Rec_info _ | Region _ -> t
diff --git a/middle_end/flambda2/types/grammar/type_grammar.mli b/middle_end/flambda2/types/grammar/type_grammar.mli
index 42c4a01da1e..7ee96a5bbe3 100644
--- a/middle_end/flambda2/types/grammar/type_grammar.mli
+++ b/middle_end/flambda2/types/grammar/type_grammar.mli
@@ -36,6 +36,7 @@ type t = private
   | Naked_int32 of head_of_kind_naked_int32 Type_descr.t
   | Naked_int64 of head_of_kind_naked_int64 Type_descr.t
   | Naked_nativeint of head_of_kind_naked_nativeint Type_descr.t
+  | Naked_vec128 of head_of_kind_naked_vec128 Type_descr.t
   | Rec_info of head_of_kind_rec_info Type_descr.t
   | Region of head_of_kind_region Type_descr.t
 
@@ -51,6 +52,7 @@ and head_of_kind_value = private
   | Boxed_int32 of t * Alloc_mode.For_types.t
   | Boxed_int64 of t * Alloc_mode.For_types.t
   | Boxed_nativeint of t * Alloc_mode.For_types.t
+  | Boxed_vec128 of t * Alloc_mode.For_types.t
   | Closures of
       { by_function_slot : row_like_for_closures;
         alloc_mode : Alloc_mode.For_types.t
@@ -80,6 +82,9 @@ and head_of_kind_naked_int64 = private Numeric_types.Int64.Set.t
 
 and head_of_kind_naked_nativeint = private Targetint_32_64.Set.t
 
+and head_of_kind_naked_vec128 = private
+  Numeric_types.Vec128_by_bit_pattern.Set.t
+
 and head_of_kind_rec_info = Rec_info_expr.t
 
 and head_of_kind_region = unit
@@ -183,6 +188,8 @@ val bottom_naked_int64 : t
 
 val bottom_naked_nativeint : t
 
+val bottom_naked_vec128 : t
+
 val bottom_rec_info : t
 
 val bottom_region : t
@@ -199,6 +206,8 @@ val any_naked_int64 : t
 
 val any_naked_nativeint : t
 
+val any_naked_vec128 : t
+
 val any_region : t
 
 val any_rec_info : t
@@ -217,6 +226,8 @@ val this_naked_int64 : Numeric_types.Int64.t -> t
 
 val this_naked_nativeint : Targetint_32_64.t -> t
 
+val this_naked_vec128 : Numeric_types.Vec128_by_bit_pattern.t -> t
+
 val these_naked_immediates : Targetint_31_63.Set.t -> t
 
 val these_naked_floats : Numeric_types.Float_by_bit_pattern.Set.t -> t
@@ -236,6 +247,9 @@ val boxed_int64_alias_to : naked_int64:Variable.t -> Alloc_mode.For_types.t -> t
 val boxed_nativeint_alias_to :
   naked_nativeint:Variable.t -> Alloc_mode.For_types.t -> t
 
+val boxed_vec128_alias_to :
+  naked_vec128:Variable.t -> Alloc_mode.For_types.t -> t
+
 val box_float : t -> Alloc_mode.For_types.t -> t
 
 val box_int32 : t -> Alloc_mode.For_types.t -> t
@@ -244,6 +258,8 @@ val box_int64 : t -> Alloc_mode.For_types.t -> t
 
 val box_nativeint : t -> Alloc_mode.For_types.t -> t
 
+val box_vec128 : t -> Alloc_mode.For_types.t -> t
+
 val tagged_immediate_alias_to : naked_immediate:Variable.t -> t
 
 val tag_immediate : t -> t
@@ -498,6 +514,8 @@ module Descr : sig
         head_of_kind_naked_int64 Type_descr.Descr.t Or_unknown_or_bottom.t
     | Naked_nativeint of
         head_of_kind_naked_nativeint Type_descr.Descr.t Or_unknown_or_bottom.t
+    | Naked_vec128 of
+        head_of_kind_naked_vec128 Type_descr.Descr.t Or_unknown_or_bottom.t
     | Rec_info of
         head_of_kind_rec_info Type_descr.Descr.t Or_unknown_or_bottom.t
     | Region of head_of_kind_region Type_descr.Descr.t Or_unknown_or_bottom.t
@@ -517,6 +535,8 @@ val create_from_head_naked_int64 : head_of_kind_naked_int64 -> t
 
 val create_from_head_naked_nativeint : head_of_kind_naked_nativeint -> t
 
+val create_from_head_naked_vec128 : head_of_kind_naked_vec128 -> t
+
 val create_from_head_rec_info : head_of_kind_rec_info -> t
 
 val create_from_head_region : head_of_kind_region -> t
@@ -543,6 +563,11 @@ val apply_coercion_head_of_kind_naked_nativeint :
   Coercion.t ->
   head_of_kind_naked_nativeint Or_bottom.t
 
+val apply_coercion_head_of_kind_naked_vec128 :
+  head_of_kind_naked_vec128 ->
+  Coercion.t ->
+  head_of_kind_naked_vec128 Or_bottom.t
+
 val apply_coercion_head_of_kind_rec_info :
   head_of_kind_rec_info -> Coercion.t -> head_of_kind_rec_info Or_bottom.t
 
@@ -570,6 +595,8 @@ module Head_of_kind_value : sig
 
   val create_boxed_nativeint : flambda_type -> Alloc_mode.For_types.t -> t
 
+  val create_boxed_vec128 : flambda_type -> Alloc_mode.For_types.t -> t
+
   val create_tagged_immediate : Targetint_31_63.t -> t
 
   val create_closures : Row_like_for_closures.t -> Alloc_mode.For_types.t -> t
@@ -636,4 +663,10 @@ module Head_of_kind_naked_nativeint :
     with type n = Targetint_32_64.t
     with type n_set = Targetint_32_64.Set.t
 
+module Head_of_kind_naked_vec128 :
+  Head_of_kind_naked_number_intf
+    with type t = head_of_kind_naked_vec128
+    with type n = Numeric_types.Vec128_by_bit_pattern.t
+    with type n_set = Numeric_types.Vec128_by_bit_pattern.Set.t
+
 val recover_some_aliases : t -> t
diff --git a/middle_end/flambda2/types/meet_and_join.ml b/middle_end/flambda2/types/meet_and_join.ml
index cdb3aa1c099..bbcaf44bc3b 100644
--- a/middle_end/flambda2/types/meet_and_join.ml
+++ b/middle_end/flambda2/types/meet_and_join.ml
@@ -309,6 +309,11 @@ and meet_expanded_head0 env (descr1 : ET.descr) (descr2 : ET.descr) :
       meet_head_of_kind_naked_nativeint env head1 head2
     in
     ET.create_naked_nativeint head, env_extension
+  | Naked_vec128 head1, Naked_vec128 head2 ->
+    let<+ head, env_extension =
+      meet_head_of_kind_naked_vec128 env head1 head2
+    in
+    ET.create_naked_vec128 head, env_extension
   | Rec_info head1, Rec_info head2 ->
     let<+ head, env_extension = meet_head_of_kind_rec_info env head1 head2 in
     ET.create_rec_info head, env_extension
@@ -316,7 +321,8 @@ and meet_expanded_head0 env (descr1 : ET.descr) (descr2 : ET.descr) :
     let<+ head, env_extension = meet_head_of_kind_region env head1 head2 in
     ET.create_region head, env_extension
   | ( ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
-      | Naked_int64 _ | Naked_nativeint _ | Rec_info _ | Region _ ),
+      | Naked_vec128 _ | Naked_int64 _ | Naked_nativeint _ | Rec_info _
+      | Region _ ),
       _ ) ->
     assert false
 
@@ -363,6 +369,10 @@ and meet_head_of_kind_value env (head1 : TG.head_of_kind_value)
     let<* n, env_extension = meet env n1 n2 in
     let<+ alloc_mode = meet_alloc_mode alloc_mode1 alloc_mode2 in
     TG.Head_of_kind_value.create_boxed_nativeint n alloc_mode, env_extension
+  | Boxed_vec128 (n1, alloc_mode1), Boxed_vec128 (n2, alloc_mode2) ->
+    let<* n, env_extension = meet env n1 n2 in
+    let<+ alloc_mode = meet_alloc_mode alloc_mode1 alloc_mode2 in
+    TG.Head_of_kind_value.create_boxed_vec128 n alloc_mode, env_extension
   | ( Closures { by_function_slot = by_function_slot1; alloc_mode = alloc_mode1 },
       Closures
         { by_function_slot = by_function_slot2; alloc_mode = alloc_mode2 } ) ->
@@ -404,7 +414,8 @@ and meet_head_of_kind_value env (head1 : TG.head_of_kind_value)
         contents alloc_mode,
       env_extension )
   | ( ( Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
-      | Boxed_int64 _ | Boxed_nativeint _ | Closures _ | String _ | Array _ ),
+      | Boxed_vec128 _ | Boxed_int64 _ | Boxed_nativeint _ | Closures _
+      | String _ | Array _ ),
       _ ) ->
     (* This assumes that all the different constructors are incompatible. This
        could break very hard for dubious uses of Obj. *)
@@ -580,6 +591,10 @@ and meet_head_of_kind_naked_nativeint _env t1 t2 : _ Or_bottom.t =
   let<+ head = TG.Head_of_kind_naked_nativeint.inter t1 t2 in
   head, TEE.empty
 
+and meet_head_of_kind_naked_vec128 _env t1 t2 : _ Or_bottom.t =
+  let<+ head = TG.Head_of_kind_naked_vec128.inter t1 t2 in
+  head, TEE.empty
+
 and meet_head_of_kind_rec_info _env t1 _t2 : _ Or_bottom.t =
   (* CR-someday lmaurer: This could be doing things like discovering two depth
      variables are equal *)
@@ -1128,6 +1143,9 @@ and join_expanded_head env kind (expanded1 : ET.t) (expanded2 : ET.t) : ET.t =
       | Naked_nativeint head1, Naked_nativeint head2 ->
         let>+ head = join_head_of_kind_naked_nativeint env head1 head2 in
         ET.create_naked_nativeint head
+      | Naked_vec128 head1, Naked_vec128 head2 ->
+        let>+ head = join_head_of_kind_naked_vec128 env head1 head2 in
+        ET.create_naked_vec128 head
       | Rec_info head1, Rec_info head2 ->
         let>+ head = join_head_of_kind_rec_info env head1 head2 in
         ET.create_rec_info head
@@ -1135,7 +1153,8 @@ and join_expanded_head env kind (expanded1 : ET.t) (expanded2 : ET.t) : ET.t =
         let>+ head = join_head_of_kind_region env head1 head2 in
         ET.create_region head
       | ( ( Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _
-          | Naked_int64 _ | Naked_nativeint _ | Rec_info _ | Region _ ),
+          | Naked_vec128 _ | Naked_int64 _ | Naked_nativeint _ | Rec_info _
+          | Region _ ),
           _ ) ->
         assert false
     in
@@ -1176,6 +1195,10 @@ and join_head_of_kind_value env (head1 : TG.head_of_kind_value)
     let>+ n = join env n1 n2 in
     let alloc_mode = join_alloc_mode alloc_mode1 alloc_mode2 in
     TG.Head_of_kind_value.create_boxed_nativeint n alloc_mode
+  | Boxed_vec128 (n1, alloc_mode1), Boxed_vec128 (n2, alloc_mode2) ->
+    let>+ n = join env n1 n2 in
+    let alloc_mode = join_alloc_mode alloc_mode1 alloc_mode2 in
+    TG.Head_of_kind_value.create_boxed_vec128 n alloc_mode
   | ( Closures { by_function_slot = by_function_slot1; alloc_mode = alloc_mode1 },
       Closures
         { by_function_slot = by_function_slot2; alloc_mode = alloc_mode2 } ) ->
@@ -1209,7 +1232,8 @@ and join_head_of_kind_value env (head1 : TG.head_of_kind_value)
     TG.Head_of_kind_value.create_array_with_contents ~element_kind ~length
       contents alloc_mode
   | ( ( Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
-      | Boxed_int64 _ | Boxed_nativeint _ | Closures _ | String _ | Array _ ),
+      | Boxed_vec128 _ | Boxed_int64 _ | Boxed_nativeint _ | Closures _
+      | String _ | Array _ ),
       _ ) ->
     Unknown
 
@@ -1315,6 +1339,9 @@ and join_head_of_kind_naked_int64 _env t1 t2 : _ Or_unknown.t =
 and join_head_of_kind_naked_nativeint _env t1 t2 : _ Or_unknown.t =
   Known (TG.Head_of_kind_naked_nativeint.union t1 t2)
 
+and join_head_of_kind_naked_vec128 _env t1 t2 : _ Or_unknown.t =
+  Known (TG.Head_of_kind_naked_vec128.union t1 t2)
+
 and join_head_of_kind_rec_info _env t1 t2 : _ Or_unknown.t =
   if Rec_info_expr.equal t1 t2 then Known t1 else Unknown
 
diff --git a/middle_end/flambda2/types/provers.ml b/middle_end/flambda2/types/provers.ml
index 570f142dc01..707a3aea834 100644
--- a/middle_end/flambda2/types/provers.ml
+++ b/middle_end/flambda2/types/provers.ml
@@ -96,12 +96,13 @@ let prove_is_int_generic env t : bool generic_proof =
   | Value
       (Ok
         ( Mutable_block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-        | Boxed_nativeint _ | Closures _ | String _ | Array _ )) ->
+        | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _
+          )) ->
     Proved false
   | Value Unknown -> Unknown
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_nativeint _ | Naked_vec128 _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_is_int env t = as_property (prove_is_int_generic env t)
@@ -130,8 +131,9 @@ let prove_get_tag_generic env t : Tag.Set.t generic_proof =
           | Known tags -> if Tag.Set.is_empty tags then Invalid else Proved tags
           )))
   | Value
-      (Ok (Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _))
-    ->
+      (Ok
+        ( Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _
+        | Boxed_vec128 _ )) ->
     Unknown
   | Value (Ok (Mutable_block _)) -> Unknown
   | Value (Ok (Closures _)) -> Unknown
@@ -140,7 +142,7 @@ let prove_get_tag_generic env t : Tag.Set.t generic_proof =
   | Value Unknown -> Unknown
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_get_tag env t = as_property (prove_get_tag_generic env t)
@@ -172,7 +174,7 @@ let prove_naked_immediates_generic env t : Targetint_31_63.Set.t generic_proof =
   | Naked_immediate Unknown -> Unknown
   | Naked_immediate Bottom -> Invalid
   | Value _ | Naked_float _ | Naked_int32 _ | Naked_int64 _ | Naked_nativeint _
-  | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Rec_info _ | Region _ ->
     wrong_kind "Naked_immediate" t
 
 let meet_naked_immediates env t =
@@ -196,7 +198,7 @@ let prove_equals_tagged_immediates env t : _ proof_of_property =
       else Unknown)
   | Value (Ok _ | Unknown | Bottom) -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let meet_equals_tagged_immediates env t : _ meet_shortcut =
@@ -208,12 +210,13 @@ let meet_equals_tagged_immediates env t : _ meet_shortcut =
   | Value
       (Ok
         ( Mutable_block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-        | Boxed_nativeint _ | Closures _ | String _ | Array _ )) ->
+        | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _
+          )) ->
     Invalid
   | Value Unknown -> Need_meet
   | Value Bottom
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     Invalid
 
 let meet_equals_single_tagged_immediate env t : _ meet_shortcut =
@@ -230,6 +233,7 @@ type _ meet_naked_number_kind =
   | Int32 : Int32.Set.t meet_naked_number_kind
   | Int64 : Int64.Set.t meet_naked_number_kind
   | Nativeint : Targetint_32_64.Set.t meet_naked_number_kind
+  | Vec128 : Numeric_types.Vec128_by_bit_pattern.Set.t meet_naked_number_kind
 
 let[@inline] meet_naked_number (type a) (kind : a meet_naked_number_kind) env t
     : a meet_shortcut =
@@ -249,6 +253,7 @@ let[@inline] meet_naked_number (type a) (kind : a meet_naked_number_kind) env t
       | Int32 -> "Naked_int32"
       | Int64 -> "Naked_int64"
       | Nativeint -> "Naked_nativeint"
+      | Vec128 -> "Naked_vec128"
     in
     wrong_kind kind_string t
   in
@@ -286,6 +291,14 @@ let[@inline] meet_naked_number (type a) (kind : a meet_naked_number_kind) env t
           (is :> Targetint_32_64.Set.t))
         ~is_empty:Targetint_32_64.Set.is_empty
     | _ -> wrong_kind ())
+  | Naked_vec128 vs -> (
+    match kind with
+    | Vec128 ->
+      head_to_proof vs
+        (fun (fs : TG.head_of_kind_naked_vec128) ->
+          (fs :> Numeric_types.Vec128_by_bit_pattern.Set.t))
+        ~is_empty:Numeric_types.Vec128_by_bit_pattern.Set.is_empty
+    | _ -> wrong_kind ())
 
 let meet_naked_floats = meet_naked_number Float
 
@@ -295,6 +308,8 @@ let meet_naked_int64s = meet_naked_number Int64
 
 let meet_naked_nativeints = meet_naked_number Nativeint
 
+let meet_naked_vec128s = meet_naked_number Vec128
+
 type variant_like_proof =
   { const_ctors : Targetint_31_63.Set.t Or_unknown.t;
     non_const_ctors_with_sizes : Targetint_31_63.t Tag.Scannable.Map.t
@@ -342,12 +357,12 @@ let prove_variant_like_generic env t : variant_like_proof generic_proof =
   | Value
       (Ok
         ( Closures _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-        | Boxed_nativeint _ | String _ )) ->
+        | Boxed_vec128 _ | Boxed_nativeint _ | String _ )) ->
     Invalid
   | Value Unknown -> Unknown
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let meet_variant_like env t =
@@ -380,10 +395,12 @@ let prove_is_a_boxed_or_tagged_number env t :
     Proved (Boxed (alloc_mode, Naked_int64, contents_ty))
   | Value (Ok (Boxed_nativeint (contents_ty, alloc_mode))) ->
     Proved (Boxed (alloc_mode, Naked_nativeint, contents_ty))
+  | Value (Ok (Boxed_vec128 (contents_ty, alloc_mode))) ->
+    Proved (Boxed (alloc_mode, Naked_vec128, contents_ty))
   | Value (Bottom | Ok (Mutable_block _ | Closures _ | String _ | Array _)) ->
     Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_is_a_tagged_immediate env t : _ proof_of_property =
@@ -398,7 +415,7 @@ let prove_is_a_boxed_float env t : _ proof_of_property =
   | Value (Ok (Boxed_float _)) -> Proved ()
   | Value _ -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_is_or_is_not_a_boxed_float env t : _ proof_of_property =
@@ -408,7 +425,7 @@ let prove_is_or_is_not_a_boxed_float env t : _ proof_of_property =
   | Value (Ok (Boxed_float _)) -> Proved true
   | Value (Ok _) -> Proved false
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_is_a_boxed_int32 env t : _ proof_of_property =
@@ -417,7 +434,7 @@ let prove_is_a_boxed_int32 env t : _ proof_of_property =
   | Value (Ok (Boxed_int32 _)) -> Proved ()
   | Value _ -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_is_a_boxed_int64 env t : _ proof_of_property =
@@ -426,7 +443,7 @@ let prove_is_a_boxed_int64 env t : _ proof_of_property =
   | Value (Ok (Boxed_int64 _)) -> Proved ()
   | Value _ -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_is_a_boxed_nativeint env t : _ proof_of_property =
@@ -435,7 +452,16 @@ let prove_is_a_boxed_nativeint env t : _ proof_of_property =
   | Value (Ok (Boxed_nativeint _)) -> Proved ()
   | Value _ -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
+    wrong_kind "Value" t
+
+let prove_is_a_boxed_vec128 env t : _ proof_of_property =
+  match expand_head env t with
+  | Value Unknown -> Unknown
+  | Value (Ok (Boxed_vec128 _)) -> Proved ()
+  | Value _ -> Unknown
+  | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_unique_tag_and_size0 env t :
@@ -460,7 +486,7 @@ let prove_unique_tag_and_size0 env t :
     ->
     Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_unique_tag_and_size env t :
@@ -505,20 +531,21 @@ let meet_is_flat_float_array env t : bool meet_shortcut =
     | Value -> Known_result false
     | Naked_number Naked_float -> Known_result true
     | Naked_number
-        (Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint)
+        ( Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint
+        | Naked_vec128 )
     | Region | Rec_info ->
       Misc.fatal_errorf "Wrong element kind for array: %a" K.With_subkind.print
         element_kind)
   | Value
       (Ok
         ( Boxed_float _ | Boxed_int32 _ | Boxed_int64 _ | Boxed_nativeint _
-        | Closures _ | String _ )) ->
+        | Boxed_vec128 _ | Closures _ | String _ )) ->
     Invalid
   | Value (Ok (Variant _ | Mutable_block _)) ->
     (* In case of untyped code using array primitives on regular blocks *)
     Need_meet
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Kind error: expected [Value]:@ %a" TG.print t
 
 let prove_is_immediates_array env t : unit proof_of_property =
@@ -533,16 +560,17 @@ let prove_is_immediates_array env t : unit proof_of_property =
     match K.With_subkind.subkind element_kind with
     | Tagged_immediate -> Proved ()
     | Anything | Boxed_float | Boxed_int32 | Boxed_int64 | Boxed_nativeint
-    | Variant _ | Float_block _ | Float_array | Immediate_array | Value_array
-    | Generic_array ->
+    | Boxed_vec128 | Variant _ | Float_block _ | Float_array | Immediate_array
+    | Value_array | Generic_array ->
       Unknown)
   | Value
       (Ok
         ( Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
-        | Boxed_int64 _ | Boxed_nativeint _ | Closures _ | String _ )) ->
+        | Boxed_vec128 _ | Boxed_int64 _ | Boxed_nativeint _ | Closures _
+        | String _ )) ->
     Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_single_closures_entry_generic env t : _ generic_proof =
@@ -566,12 +594,13 @@ let prove_single_closures_entry_generic env t : _ generic_proof =
   | Value
       (Ok
         ( Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
-        | Boxed_int64 _ | Boxed_nativeint _ | String _ | Array _ )) ->
+        | Boxed_vec128 _ | Boxed_int64 _ | Boxed_nativeint _ | String _
+        | Array _ )) ->
     Invalid
   | Value Unknown -> Unknown
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let meet_single_closures_entry env t =
@@ -588,7 +617,7 @@ let meet_is_immutable_array env t : _ meet_shortcut =
     | Unknown -> Need_meet)
   | Value (Ok _)
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     Invalid
 
 let prove_single_closures_entry env t =
@@ -601,7 +630,7 @@ let meet_strings env t : String_info.Set.t meet_shortcut =
   | Value Unknown -> Need_meet
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Kind error: expected [Value]:@ %a" TG.print t
 
 let prove_strings env t : _ proof_of_property =
@@ -612,7 +641,7 @@ let prove_strings env t : _ proof_of_property =
     Proved (Alloc_mode.For_types.heap, strs)
   | Value (Ok _ | Unknown | Bottom) -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Kind error: expected [Value]:@ %a" TG.print t
 
 type tagging_proof_kind =
@@ -655,7 +684,7 @@ let[@inline always] inspect_tagging_of_simple proof_kind env ~min_name_mode t :
     | Meet, _ -> inspect_immediates ())
   | Value _ -> Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let prove_tagging_of_simple env ~min_name_mode t =
@@ -679,7 +708,7 @@ let[@inline always] meet_boxed_number_containing_simple
   | Value Unknown -> Need_meet
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     Misc.fatal_errorf "Kind error: expected [Value]:@ %a" TG.print t
 
 let meet_boxed_float_containing_simple =
@@ -688,7 +717,7 @@ let meet_boxed_float_containing_simple =
       match ty_value with
       | Boxed_float (ty, _) -> Some ty
       | Variant _ | Mutable_block _ | Boxed_int32 _ | Boxed_int64 _
-      | Boxed_nativeint _ | Closures _ | String _ | Array _ ->
+      | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _ ->
         None)
 
 let meet_boxed_int32_containing_simple =
@@ -697,7 +726,7 @@ let meet_boxed_int32_containing_simple =
       match ty_value with
       | Boxed_int32 (ty, _) -> Some ty
       | Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int64 _
-      | Boxed_nativeint _ | Closures _ | String _ | Array _ ->
+      | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _ ->
         None)
 
 let meet_boxed_int64_containing_simple =
@@ -706,7 +735,7 @@ let meet_boxed_int64_containing_simple =
       match ty_value with
       | Boxed_int64 (ty, _) -> Some ty
       | Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
-      | Boxed_nativeint _ | Closures _ | String _ | Array _ ->
+      | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _ ->
         None)
 
 let meet_boxed_nativeint_containing_simple =
@@ -715,7 +744,16 @@ let meet_boxed_nativeint_containing_simple =
       match ty_value with
       | Boxed_nativeint (ty, _) -> Some ty
       | Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
-      | Boxed_int64 _ | Closures _ | String _ | Array _ ->
+      | Boxed_vec128 _ | Boxed_int64 _ | Closures _ | String _ | Array _ ->
+        None)
+
+let meet_boxed_vec128_containing_simple =
+  meet_boxed_number_containing_simple
+    ~contents_of_boxed_number:(fun (ty_value : TG.head_of_kind_value) ->
+      match ty_value with
+      | Boxed_vec128 (ty, _) -> Some ty
+      | Variant _ | Mutable_block _ | Boxed_float _ | Boxed_int32 _
+      | Boxed_nativeint _ | Boxed_int64 _ | Closures _ | String _ | Array _ ->
         None)
 
 let meet_block_field_simple env ~min_name_mode ~field_kind t field_index :
@@ -751,7 +789,7 @@ let meet_block_field_simple env ~min_name_mode ~field_kind t field_index :
   | Value Unknown -> Need_meet
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let meet_project_function_slot_simple env ~min_name_mode t function_slot :
@@ -773,7 +811,7 @@ let meet_project_function_slot_simple env ~min_name_mode t function_slot :
   | Value Unknown -> Need_meet
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let meet_project_value_slot_simple env ~min_name_mode t value_slot :
@@ -800,7 +838,7 @@ let meet_project_value_slot_simple env ~min_name_mode t value_slot :
   | Value Unknown -> Need_meet
   | Value Bottom -> Invalid
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let meet_rec_info env t : Rec_info_expr.t meet_shortcut =
@@ -809,7 +847,7 @@ let meet_rec_info env t : Rec_info_expr.t meet_shortcut =
   | Rec_info Unknown -> Need_meet
   | Rec_info Bottom -> Invalid
   | Value _ | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Region _ ->
     wrong_kind "Rec_info" t
 
 let prove_alloc_mode_of_boxed_number env t :
@@ -818,13 +856,14 @@ let prove_alloc_mode_of_boxed_number env t :
   | Value (Ok (Boxed_float (_, alloc_mode)))
   | Value (Ok (Boxed_int32 (_, alloc_mode)))
   | Value (Ok (Boxed_int64 (_, alloc_mode)))
-  | Value (Ok (Boxed_nativeint (_, alloc_mode))) ->
+  | Value (Ok (Boxed_nativeint (_, alloc_mode)))
+  | Value (Ok (Boxed_vec128 (_, alloc_mode))) ->
     Proved alloc_mode
   | Value (Ok (Variant _ | Mutable_block _ | String _ | Array _ | Closures _))
   | Value (Unknown | Bottom) ->
     Unknown
   | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-  | Naked_nativeint _ | Rec_info _ | Region _ ->
+  | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
     wrong_kind "Value" t
 
 let never_holds_locally_allocated_values env var : _ proof_of_property =
@@ -846,6 +885,7 @@ let never_holds_locally_allocated_values env var : _ proof_of_property =
     | Value (Ok (Boxed_int32 (_, alloc_mode)))
     | Value (Ok (Boxed_int64 (_, alloc_mode)))
     | Value (Ok (Boxed_nativeint (_, alloc_mode)))
+    | Value (Ok (Boxed_vec128 (_, alloc_mode)))
     | Value (Ok (Mutable_block { alloc_mode }))
     | Value (Ok (Closures { alloc_mode; _ }))
     | Value (Ok (Array { alloc_mode; _ })) -> (
@@ -856,7 +896,7 @@ let never_holds_locally_allocated_values env var : _ proof_of_property =
     | Value Unknown -> Unknown
     | Value Bottom -> Unknown
     | Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-    | Naked_nativeint _ | Rec_info _ | Region _ ->
+    | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ->
       Proved ())
 
 let prove_physical_equality env t1 t2 =
@@ -874,17 +914,24 @@ let prove_physical_equality env t1 t2 =
     | Naked_nativeint (Ok s1), Naked_nativeint (Ok s2) ->
       let module IS = Targetint_32_64.Set in
       IS.is_empty (IS.inter (s1 :> IS.t) (s2 :> IS.t))
-    | _, _ -> false
+    | Naked_vec128 (Ok s1), Naked_vec128 (Ok s2) ->
+      let module IS = Numeric_types.Vec128_by_bit_pattern.Set in
+      IS.is_empty (IS.inter (s1 :> IS.t) (s2 :> IS.t))
+    | ( ( Naked_float _ | Naked_int32 _ | Naked_int64 _ | Naked_nativeint _
+        | Naked_vec128 _ | Value _ | Naked_immediate _ | Region _ | Rec_info _
+          ),
+        _ ) ->
+      false
   in
   let check_heads () : _ proof_of_property =
     match expand_head env t1, expand_head env t2 with
     | ( ( Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ),
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ),
         _ ) ->
       wrong_kind "Value" t1
     | ( _,
         ( Naked_immediate _ | Naked_float _ | Naked_int32 _ | Naked_int64 _
-        | Naked_nativeint _ | Rec_info _ | Region _ ) ) ->
+        | Naked_vec128 _ | Naked_nativeint _ | Rec_info _ | Region _ ) ) ->
       wrong_kind "Value" t2
     | Value (Unknown | Bottom), _ | _, Value (Unknown | Bottom) -> Unknown
     | Value (Ok head1), Value (Ok head2) -> (
@@ -901,6 +948,8 @@ let prove_physical_equality env t1 t2 =
         if incompatible_naked_numbers t1 t2 then Proved false else Unknown
       | Boxed_nativeint (t1, _), Boxed_nativeint (t2, _) ->
         if incompatible_naked_numbers t1 t2 then Proved false else Unknown
+      | Boxed_vec128 (t1, _), Boxed_vec128 (t2, _) ->
+        if incompatible_naked_numbers t1 t2 then Proved false else Unknown
       | Closures _, Closures _ -> Unknown
       | String s1, String s2 ->
         let module SS = String_info.Set in
@@ -908,9 +957,11 @@ let prove_physical_equality env t1 t2 =
       (* Immediates and allocated values -> Proved false *)
       | ( Variant { immediates = _; blocks = Known blocks; is_unique = _ },
           ( Mutable_block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-          | Boxed_nativeint _ | Closures _ | String _ | Array _ ) )
+          | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _
+            ) )
       | ( ( Mutable_block _ | Boxed_float _ | Boxed_int32 _ | Boxed_int64 _
-          | Boxed_nativeint _ | Closures _ | String _ | Array _ ),
+          | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _
+            ),
           Variant { immediates = _; blocks = Known blocks; is_unique = _ } )
         when TG.Row_like_for_blocks.is_bottom blocks ->
         Proved false
@@ -997,22 +1048,30 @@ let prove_physical_equality env t1 t2 =
       (* Boxed numbers with non-numbers or different kinds -> Proved *)
       | ( Boxed_float _,
           ( Variant _ | Mutable_block _ | Boxed_int32 _ | Boxed_int64 _
-          | Boxed_nativeint _ | Closures _ | String _ | Array _ ) )
+          | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _
+            ) )
       | ( ( Variant _ | Mutable_block _ | Boxed_int32 _ | Boxed_int64 _
-          | Boxed_nativeint _ | Closures _ | String _ | Array _ ),
+          | Boxed_vec128 _ | Boxed_nativeint _ | Closures _ | String _ | Array _
+            ),
           Boxed_float _ )
       | ( Boxed_int32 _,
           ( Variant _ | Mutable_block _ | Boxed_int64 _ | Boxed_nativeint _
-          | Closures _ | String _ | Array _ ) )
+          | Boxed_vec128 _ | Closures _ | String _ | Array _ ) )
       | ( ( Variant _ | Mutable_block _ | Boxed_int64 _ | Boxed_nativeint _
-          | Closures _ | String _ | Array _ ),
+          | Boxed_vec128 _ | Closures _ | String _ | Array _ ),
           Boxed_int32 _ )
       | ( Boxed_int64 _,
+          ( Variant _ | Mutable_block _ | Boxed_nativeint _ | Closures _
+          | Boxed_vec128 _ | String _ | Array _ ) )
+      | ( ( Variant _ | Mutable_block _ | Boxed_nativeint _ | Closures _
+          | Boxed_vec128 _ | String _ | Array _ ),
+          Boxed_int64 _ )
+      | ( Boxed_vec128 _,
           ( Variant _ | Mutable_block _ | Boxed_nativeint _ | Closures _
           | String _ | Array _ ) )
       | ( ( Variant _ | Mutable_block _ | Boxed_nativeint _ | Closures _
           | String _ | Array _ ),
-          Boxed_int64 _ )
+          Boxed_vec128 _ )
       | ( Boxed_nativeint _,
           (Variant _ | Mutable_block _ | Closures _ | String _ | Array _) )
       | ( (Variant _ | Mutable_block _ | Closures _ | String _ | Array _),
diff --git a/middle_end/flambda2/types/provers.mli b/middle_end/flambda2/types/provers.mli
index 1525fdc0302..66e843fbe14 100644
--- a/middle_end/flambda2/types/provers.mli
+++ b/middle_end/flambda2/types/provers.mli
@@ -62,6 +62,11 @@ val meet_naked_int64s :
 val meet_naked_nativeints :
   Typing_env.t -> Type_grammar.t -> Targetint_32_64.Set.t meet_shortcut
 
+val meet_naked_vec128s :
+  Typing_env.t ->
+  Type_grammar.t ->
+  Numeric_types.Vec128_by_bit_pattern.Set.t meet_shortcut
+
 type variant_like_proof = private
   { const_ctors : Targetint_31_63.Set.t Or_unknown.t;
     non_const_ctors_with_sizes : Targetint_31_63.t Tag.Scannable.Map.t
@@ -96,6 +101,9 @@ val prove_is_a_boxed_int64 :
 val prove_is_a_boxed_nativeint :
   Typing_env.t -> Type_grammar.t -> unit proof_of_property
 
+val prove_is_a_boxed_vec128 :
+  Typing_env.t -> Type_grammar.t -> unit proof_of_property
+
 val prove_is_or_is_not_a_boxed_float :
   Typing_env.t -> Type_grammar.t -> bool proof_of_property
 
@@ -190,6 +198,12 @@ val meet_boxed_nativeint_containing_simple :
   Type_grammar.t ->
   Simple.t meet_shortcut
 
+val meet_boxed_vec128_containing_simple :
+  Typing_env.t ->
+  min_name_mode:Name_mode.t ->
+  Type_grammar.t ->
+  Simple.t meet_shortcut
+
 val meet_block_field_simple :
   Typing_env.t ->
   min_name_mode:Name_mode.t ->
diff --git a/middle_end/flambda2/types/reify.ml b/middle_end/flambda2/types/reify.ml
index 40f14314581..f9ca741ffbc 100644
--- a/middle_end/flambda2/types/reify.ml
+++ b/middle_end/flambda2/types/reify.ml
@@ -30,6 +30,7 @@ type to_lift =
   | Boxed_int32 of Int32.t
   | Boxed_int64 of Int64.t
   | Boxed_nativeint of Targetint_32_64.t
+  | Boxed_vec128 of Numeric_types.Vec128_by_bit_pattern.t
   | Immutable_float_array of { fields : Float.t list }
   | Immutable_value_array of { fields : Simple.t list }
   | Empty_array
@@ -58,7 +59,7 @@ let try_to_reify_fields env ~var_allowed alloc_mode ~field_types =
               match Reg_width_const.descr const with
               | Tagged_immediate _imm -> Some simple
               | Naked_immediate _ | Naked_float _ | Naked_int32 _
-              | Naked_int64 _ | Naked_nativeint _ ->
+              | Naked_vec128 _ | Naked_int64 _ | Naked_nativeint _ ->
                 (* This should never happen, as we should have got a kind error
                    instead *)
                 None)
@@ -315,6 +316,13 @@ let reify ~allowed_if_free_vars_defined_in ~var_is_defined_at_toplevel
       match Targetint_32_64.Set.get_singleton (ns :> Targetint_32_64.Set.t) with
       | None -> try_canonical_simple ()
       | Some n -> Simple (Simple.const (Reg_width_const.naked_nativeint n)))
+    | Naked_vec128 (Ok ns) -> (
+      match
+        Numeric_types.Vec128_by_bit_pattern.Set.get_singleton
+          (ns :> Numeric_types.Vec128_by_bit_pattern.Set.t)
+      with
+      | None -> try_canonical_simple ()
+      | Some n -> Simple (Simple.const (Reg_width_const.naked_vec128 n)))
     (* CR-someday mshinwell: These could lift at toplevel when [ty_naked_float]
        is an alias type. That would require checking the alloc mode. *)
     | Value (Ok (Boxed_float (ty_naked_float, _alloc_mode))) -> (
@@ -353,6 +361,14 @@ let reify ~allowed_if_free_vars_defined_in ~var_is_defined_at_toplevel
         match Targetint_32_64.Set.get_singleton ns with
         | None -> try_canonical_simple ()
         | Some n -> Lift (Boxed_nativeint n)))
+    | Value (Ok (Boxed_vec128 (ty_naked_vec128, _alloc_mode))) -> (
+      match Provers.meet_naked_vec128s env ty_naked_vec128 with
+      | Need_meet -> try_canonical_simple ()
+      | Invalid -> Invalid
+      | Known_result ns -> (
+        match Numeric_types.Vec128_by_bit_pattern.Set.get_singleton ns with
+        | None -> try_canonical_simple ()
+        | Some n -> Lift (Boxed_vec128 n)))
     | Value
         (Ok
           (Array
@@ -417,7 +433,8 @@ let reify ~allowed_if_free_vars_defined_in ~var_is_defined_at_toplevel
             | Ok fields_rev ->
               Lift (Immutable_float_array { fields = List.rev fields_rev }))
           | Naked_number
-              (Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint)
+              ( Naked_immediate | Naked_int32 | Naked_int64 | Naked_nativeint
+              | Naked_vec128 )
           | Region | Rec_info ->
             Misc.fatal_errorf
               "Unexpected kind %a in immutable array case when reifying type:@ \
@@ -429,6 +446,7 @@ let reify ~allowed_if_free_vars_defined_in ~var_is_defined_at_toplevel
     | Naked_int32 Bottom
     | Naked_int64 Bottom
     | Naked_nativeint Bottom
+    | Naked_vec128 Bottom
     | Rec_info Bottom
     | Region Bottom ->
       Invalid
@@ -438,6 +456,7 @@ let reify ~allowed_if_free_vars_defined_in ~var_is_defined_at_toplevel
     | Naked_float Unknown
     | Naked_int32 Unknown
     | Naked_int64 Unknown
+    | Naked_vec128 Unknown
     | Naked_nativeint Unknown
     | Rec_info Unknown
     | Region (Unknown | Ok _)
diff --git a/middle_end/flambda2/types/reify.mli b/middle_end/flambda2/types/reify.mli
index 98281f3c23a..708285d0a45 100644
--- a/middle_end/flambda2/types/reify.mli
+++ b/middle_end/flambda2/types/reify.mli
@@ -27,6 +27,7 @@ type to_lift = private
   | Boxed_int32 of Numeric_types.Int32.t
   | Boxed_int64 of Numeric_types.Int64.t
   | Boxed_nativeint of Targetint_32_64.t
+  | Boxed_vec128 of Numeric_types.Vec128_by_bit_pattern.t
   | Immutable_float_array of
       { fields : Numeric_types.Float_by_bit_pattern.t list }
   | Immutable_value_array of { fields : Simple.t list }
diff --git a/middle_end/printclambda.ml b/middle_end/printclambda.ml
index 984c31715c9..923fcaf112f 100644
--- a/middle_end/printclambda.ml
+++ b/middle_end/printclambda.ml
@@ -37,6 +37,7 @@ let rec value_kind0 ppf kind =
   | Pboxedintval Pnativeint -> Format.pp_print_string ppf ":nativeint"
   | Pboxedintval Pint32 -> Format.pp_print_string ppf ":int32"
   | Pboxedintval Pint64 -> Format.pp_print_string ppf ":int64"
+  | Pboxedvectorval Pvec128 -> Format.pp_print_string ppf ":vec128"
   | Pvariant { consts; non_consts } ->
     Format.fprintf ppf "@[<hov 1>[(consts (%a))@ (non_consts (%a))]@]"
       (Format.pp_print_list ~pp_sep:Format.pp_print_space Format.pp_print_int)
@@ -60,12 +61,14 @@ let layout (layout : Lambda.layout) =
   | Punboxed_int Pint32 -> ":unboxed_int32"
   | Punboxed_int Pint64 -> ":unboxed_int64"
   | Punboxed_int Pnativeint -> ":unboxed_nativeint"
+  | Punboxed_vector Pvec128 -> ":unboxed_vec128"
 
 let rec structured_constant ppf = function
   | Uconst_float x -> fprintf ppf "%F" x
   | Uconst_int32 x -> fprintf ppf "%ldl" x
   | Uconst_int64 x -> fprintf ppf "%LdL" x
   | Uconst_nativeint x -> fprintf ppf "%ndn" x
+  | Uconst_vec128 {high; low} -> fprintf ppf "%016Lx:%016Lx" high low
   | Uconst_block (tag, l) ->
       fprintf ppf "block(%i" tag;
       List.iter (fun u -> fprintf ppf ",%a" uconstant u) l;
diff --git a/ocaml/Makefile.common-jst b/ocaml/Makefile.common-jst
index a9f46fe2315..acd7a7689db 100644
--- a/ocaml/Makefile.common-jst
+++ b/ocaml/Makefile.common-jst
@@ -198,6 +198,9 @@ install_for_test: _install
 	 # replace backend-specific testsuite/tests/asmgen with their new versions
 	rm _runtest/testsuite/tests/asmgen/*
 	cp -a testsuite/tests/asmgen/* _runtest/testsuite/tests/asmgen/
+	 # replace backend-specific testsuite/tests/unboxed-primitive-args with their new versions
+	rm _runtest/testsuite/tests/unboxed-primitive-args/*
+	cp -a testsuite/tests/unboxed-primitive-args/* _runtest/testsuite/tests/unboxed-primitive-args/
 
 	cp $(ocamldir)/Makefile.* _runtest/
 
diff --git a/ocaml/asmcomp/amd64/emit.mlp b/ocaml/asmcomp/amd64/emit.mlp
index 39ccf590568..c69a07418b5 100644
--- a/ocaml/asmcomp/amd64/emit.mlp
+++ b/ocaml/asmcomp/amd64/emit.mlp
@@ -1210,7 +1210,7 @@ let probe_env p =
   env.stack_offset <- p.probe_stack_offset;
   (* Account for the return address that is now pushed on the stack. *)
   env.stack_offset <- env.stack_offset + 8;
-  env  
+  env
 
 let emit_probe_handler_wrapper p =
   let wrap_label = probe_handler_wrapper_name p.probe_label in
diff --git a/ocaml/asmcomp/cmm_helpers.ml b/ocaml/asmcomp/cmm_helpers.ml
index 5f6edb8e368..46a559aed81 100644
--- a/ocaml/asmcomp/cmm_helpers.ml
+++ b/ocaml/asmcomp/cmm_helpers.ml
@@ -919,6 +919,8 @@ module Extended_machtype = struct
       (* Only 64-bit architectures, so this is always [typ_int] *)
       typ_any_int
     | Pvalue Pintval -> typ_tagged_int
+    | Punboxed_vector _ ->
+      Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
     | Pvalue _ -> typ_val
 end
 
@@ -3283,5 +3285,8 @@ let kind_of_layout (layout : Lambda.layout) =
   | Pvalue (Pboxedintval bi) -> Boxed_integer bi
   | Pvalue (Pgenval | Pintval | Pvariant _ | Parrayval _)
   | Ptop | Pbottom | Punboxed_float | Punboxed_int _ -> Any
+  | Pvalue (Pboxedvectorval _)
+  | Punboxed_vector _ ->
+    Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
 
 let make_tuple l = match l with [e] -> e | _ -> Ctuple l
diff --git a/ocaml/asmcomp/cmmgen.ml b/ocaml/asmcomp/cmmgen.ml
index cb576e08247..a6b7dc5b20e 100644
--- a/ocaml/asmcomp/cmmgen.ml
+++ b/ocaml/asmcomp/cmmgen.ml
@@ -124,6 +124,8 @@ let get_field env layout ptr n dbg =
     | Pvalue Pintval | Punboxed_int _ -> Word_int
     | Pvalue _ -> Word_val
     | Punboxed_float -> Double
+    | Punboxed_vector _ ->
+      Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
     | Ptop ->
         Misc.fatal_errorf "get_field with Ptop: %a" Debuginfo.print_compact dbg
     | Pbottom ->
@@ -842,6 +844,8 @@ and transl_ccall env prim args dbg =
           | Pint32 -> XInt32
           | Pint64 -> XInt64 in
         (xty, transl_unbox_int dbg env bi arg)
+    | Unboxed_vector _ ->
+      Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
     | Untagged_int ->
         (XInt, untag_int (transl env arg) dbg)
   in
@@ -871,6 +875,8 @@ and transl_ccall env prim args dbg =
         ([|Int; Int|], box_int dbg Pint64 alloc_heap)
     | _, Unboxed_integer bi -> (typ_int, box_int dbg bi alloc_heap)
     | _, Untagged_int -> (typ_int, (fun i -> tag_int i dbg))
+    | _, Unboxed_vector _ ->
+      Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
   in
   let typ_args, args = transl_args prim.prim_native_repr_args args in
   wrap_result
@@ -1244,6 +1250,8 @@ and transl_let_value env str (kind : Lambda.value_kind) id exp transl_body =
         Boxed (Boxed_float (alloc_heap, dbg), false)
     | Mutable, Pboxedintval bi ->
         Boxed (Boxed_integer (bi, alloc_heap, dbg), false)
+    | _, Pboxedvectorval _ ->
+        Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
     | _, (Pfloatval | Pboxedintval _) ->
         (* It would be safe to always unbox in this case, but
            we do it only if this indeed allows us to get rid of
@@ -1290,6 +1298,8 @@ and transl_let env str (layout : Lambda.layout) id exp transl_body =
          there may be constant closures inside that need lifting out. *)
       let _cbody : expression = transl_body env in
       cexp
+  | Punboxed_vector _ ->
+      Misc.fatal_error "SIMD vectors are not yet suppored in the upstream compiler build."
   | Punboxed_float | Punboxed_int _ -> begin
       let cexp = transl env exp in
       let cbody = transl_body env in
diff --git a/ocaml/boot/ocamlc b/ocaml/boot/ocamlc
index b0ee3819b10..fcfac0201e4 100755
Binary files a/ocaml/boot/ocamlc and b/ocaml/boot/ocamlc differ
diff --git a/ocaml/boot/ocamllex b/ocaml/boot/ocamllex
index 6da593d2da4..da767d2e389 100755
Binary files a/ocaml/boot/ocamllex and b/ocaml/boot/ocamllex differ
diff --git a/ocaml/lambda/lambda.ml b/ocaml/lambda/lambda.ml
index 48f210df8a7..6a7932f739a 100644
--- a/ocaml/lambda/lambda.ml
+++ b/ocaml/lambda/lambda.ml
@@ -260,12 +260,14 @@ and value_kind =
       non_consts : (int * value_kind list) list;
     }
   | Parrayval of array_kind
+  | Pboxedvectorval of boxed_vector
 
 and layout =
   | Ptop
   | Pvalue of value_kind
   | Punboxed_float
   | Punboxed_int of boxed_integer
+  | Punboxed_vector of boxed_vector
   | Pbottom
 
 and block_shape =
@@ -289,6 +291,9 @@ and array_set_kind =
 and boxed_integer = Primitive.boxed_integer =
     Pnativeint | Pint32 | Pint64
 
+and boxed_vector = Primitive.boxed_vector =
+  | Pvec128
+
 and bigarray_kind =
     Pbigarray_unknown
   | Pbigarray_float32 | Pbigarray_float64
@@ -310,6 +315,8 @@ and raise_kind =
 
 let equal_boxed_integer = Primitive.equal_boxed_integer
 
+let equal_boxed_vector = Primitive.equal_boxed_vector
+
 let equal_primitive =
   (* Should be implemented like [equal_value_kind] of [equal_boxed_integer],
      i.e. by matching over the various constructors but the type has more
@@ -321,6 +328,8 @@ let rec equal_value_kind x y =
   | Pgenval, Pgenval -> true
   | Pfloatval, Pfloatval -> true
   | Pboxedintval bi1, Pboxedintval bi2 -> equal_boxed_integer bi1 bi2
+  | Pboxedvectorval bi1, Pboxedvectorval bi2 ->
+    equal_boxed_vector bi1 bi2
   | Pintval, Pintval -> true
   | Parrayval elt_kind1, Parrayval elt_kind2 -> elt_kind1 = elt_kind2
   | Pvariant { consts = consts1; non_consts = non_consts1; },
@@ -337,7 +346,7 @@ let rec equal_value_kind x y =
              && List.for_all2 equal_value_kind fields1 fields2)
            non_consts1 non_consts2
   | (Pgenval | Pfloatval | Pboxedintval _ | Pintval | Pvariant _
-      | Parrayval _), _ -> false
+      | Parrayval _ | Pboxedvectorval _), _ -> false
 
 let equal_layout x y =
   match x, y with
@@ -354,9 +363,10 @@ let compatible_layout x y =
   | Punboxed_float, Punboxed_float -> true
   | Punboxed_int bi1, Punboxed_int bi2 ->
       equal_boxed_integer bi1 bi2
+  | Punboxed_vector bi1, Punboxed_vector bi2 -> equal_boxed_vector bi1 bi2
   | Ptop, Ptop -> true
   | Ptop, _ | _, Ptop -> false
-  | (Pvalue _ | Punboxed_float | Punboxed_int _), _ -> false
+  | (Pvalue _ | Punboxed_float | Punboxed_int _ | Punboxed_vector _), _ -> false
 
 let must_be_value layout =
   match layout with
@@ -640,6 +650,8 @@ let layout_functor = Pvalue Pgenval
 let layout_boxed_float = Pvalue Pfloatval
 let layout_string = Pvalue Pgenval
 let layout_boxedint bi = Pvalue (Pboxedintval bi)
+
+let layout_boxed_vector vi = Pvalue (Pboxedvectorval vi)
 let layout_lazy = Pvalue Pgenval
 let layout_lazy_contents = Pvalue Pgenval
 let layout_any_value = Pvalue Pgenval
@@ -1442,6 +1454,7 @@ let primitive_result_layout (p : primitive) =
   | Pbox_float _ -> layout_boxed_float
   | Punbox_float -> Punboxed_float
   | Pccall { prim_native_repr_res = _, Untagged_int; _} -> layout_int
+  | Pccall { prim_native_repr_res = _, Unboxed_vector v; _} -> layout_boxed_vector v
   | Pccall { prim_native_repr_res = _, Unboxed_float; _} -> layout_boxed_float
   | Pccall { prim_native_repr_res = _, Same_as_ocaml_repr s; _} ->
       begin match s with
diff --git a/ocaml/lambda/lambda.mli b/ocaml/lambda/lambda.mli
index 09ea1eb336d..b6236bfb775 100644
--- a/ocaml/lambda/lambda.mli
+++ b/ocaml/lambda/lambda.mli
@@ -235,6 +235,7 @@ and value_kind =
           expected to be significant. *)
     }
   | Parrayval of array_kind
+  | Pboxedvectorval of boxed_vector
 
 (* Because we check for and error on void in the translation to lambda, we don't
    need a constructor for it here. *)
@@ -243,6 +244,7 @@ and layout =
   | Pvalue of value_kind
   | Punboxed_float
   | Punboxed_int of boxed_integer
+  | Punboxed_vector of boxed_vector
   | Pbottom
 
 and block_shape =
@@ -251,6 +253,9 @@ and block_shape =
 and boxed_integer = Primitive.boxed_integer =
     Pnativeint | Pint32 | Pint64
 
+and boxed_vector = Primitive.boxed_vector =
+  | Pvec128
+
 and bigarray_kind =
     Pbigarray_unknown
   | Pbigarray_float32 | Pbigarray_float64
@@ -280,6 +285,8 @@ val compatible_layout : layout -> layout -> bool
 
 val equal_boxed_integer : boxed_integer -> boxed_integer -> bool
 
+val equal_boxed_vector : boxed_vector -> boxed_vector -> bool
+
 val must_be_value : layout -> value_kind
 
 type structured_constant =
diff --git a/ocaml/lambda/printlambda.ml b/ocaml/lambda/printlambda.ml
index 9f6b1aaac15..427120e9415 100644
--- a/ocaml/lambda/printlambda.ml
+++ b/ocaml/lambda/printlambda.ml
@@ -85,6 +85,9 @@ let boxed_integer_name = function
   | Pint32 -> "int32"
   | Pint64 -> "int64"
 
+let boxed_vector_name = function
+  | Pvec128 -> "vec128"
+
 let variant_kind print_contents ppf ~consts ~non_consts =
   fprintf ppf "@[<hov 1>[(consts (%a))@ (non_consts (%a))]@]"
     (Format.pp_print_list ~pp_sep:Format.pp_print_space Format.pp_print_int)
@@ -105,6 +108,7 @@ let rec value_kind ppf = function
   | Pfloatval -> fprintf ppf "[float]"
   | Parrayval elt_kind -> fprintf ppf "[%sarray]" (array_kind elt_kind)
   | Pboxedintval bi -> fprintf ppf "[%s]" (boxed_integer_name bi)
+  | Pboxedvectorval bv -> fprintf ppf "[%s]" (boxed_vector_name bv)
   | Pvariant { consts; non_consts; } ->
     variant_kind value_kind' ppf ~consts ~non_consts
 
@@ -114,6 +118,7 @@ and value_kind' ppf = function
   | Pfloatval -> fprintf ppf "[float]"
   | Parrayval elt_kind -> fprintf ppf "[%sarray]" (array_kind elt_kind)
   | Pboxedintval bi -> fprintf ppf "[%s]" (boxed_integer_name bi)
+  | Pboxedvectorval bv -> fprintf ppf "[%s]" (boxed_vector_name bv)
   | Pvariant { consts; non_consts; } ->
     variant_kind value_kind' ppf ~consts ~non_consts
 
@@ -124,6 +129,7 @@ let layout ppf layout =
   | Pbottom -> fprintf ppf "[bottom]"
   | Punboxed_float -> fprintf ppf "[unboxed_float]"
   | Punboxed_int bi -> fprintf ppf "[unboxed_%s]" (boxed_integer_name bi)
+  | Punboxed_vector bi -> fprintf ppf "[unboxed_%s]" (boxed_vector_name bi)
 
 let return_kind ppf (mode, kind) =
   let smode = alloc_mode mode in
@@ -135,10 +141,13 @@ let return_kind ppf (mode, kind) =
   | Pvalue (Parrayval elt_kind) ->
      fprintf ppf ": %s%sarray@ " smode (array_kind elt_kind)
   | Pvalue (Pboxedintval bi) -> fprintf ppf ": %s%s@ " smode (boxed_integer_name bi)
+  | Pvalue (Pboxedvectorval bv) ->
+    fprintf ppf ": %s%s@ " smode (boxed_vector_name bv)
   | Pvalue (Pvariant { consts; non_consts; }) ->
     variant_kind value_kind' ppf ~consts ~non_consts
   | Punboxed_float -> fprintf ppf ": unboxed_float@ "
   | Punboxed_int bi -> fprintf ppf ": unboxed_%s@ " (boxed_integer_name bi)
+  | Punboxed_vector bi -> fprintf ppf ": unboxed_%s@ " (boxed_vector_name bi)
   | Ptop -> fprintf ppf ": top@ "
   | Pbottom -> fprintf ppf ": bottom@ "
 
@@ -148,6 +157,7 @@ let field_kind ppf = function
   | Pfloatval -> pp_print_string ppf "float"
   | Parrayval elt_kind -> fprintf ppf "%s-array" (array_kind elt_kind)
   | Pboxedintval bi -> pp_print_string ppf (boxed_integer_name bi)
+  | Pboxedvectorval bv -> pp_print_string ppf (boxed_vector_name bv)
   | Pvariant { consts; non_consts; } ->
     fprintf ppf "@[<hov 1>[(consts (%a))@ (non_consts (%a))]@]"
       (Format.pp_print_list ~pp_sep:Format.pp_print_space Format.pp_print_int)
diff --git a/ocaml/middle_end/clambda_primitives.ml b/ocaml/middle_end/clambda_primitives.ml
index 92725a7a771..b2f2b908ead 100644
--- a/ocaml/middle_end/clambda_primitives.ml
+++ b/ocaml/middle_end/clambda_primitives.ml
@@ -158,18 +158,23 @@ and value_kind = Lambda.value_kind =
       non_consts : (int * value_kind list) list;
     }
   | Parrayval of array_kind
+  | Pboxedvectorval of boxed_vector 
 
 and layout = Lambda.layout =
   | Ptop
   | Pvalue of value_kind
   | Punboxed_float
   | Punboxed_int of boxed_integer
+  | Punboxed_vector of boxed_vector
   | Pbottom
 
 and block_shape = Lambda.block_shape
 and boxed_integer = Primitive.boxed_integer =
     Pnativeint | Pint32 | Pint64
 
+and boxed_vector = Primitive.boxed_vector = 
+  | Pvec128
+
 and bigarray_kind = Lambda.bigarray_kind =
     Pbigarray_unknown
   | Pbigarray_float32 | Pbigarray_float64
diff --git a/ocaml/middle_end/clambda_primitives.mli b/ocaml/middle_end/clambda_primitives.mli
index b65e674ee76..06bd54fd94c 100644
--- a/ocaml/middle_end/clambda_primitives.mli
+++ b/ocaml/middle_end/clambda_primitives.mli
@@ -161,18 +161,23 @@ and value_kind = Lambda.value_kind =
       non_consts : (int * value_kind list) list;
     }
   | Parrayval of array_kind
+  | Pboxedvectorval of boxed_vector 
 
 and layout = Lambda.layout =
   | Ptop
   | Pvalue of value_kind
   | Punboxed_float
   | Punboxed_int of boxed_integer
+  | Punboxed_vector of boxed_vector
   | Pbottom
 
 and block_shape = Lambda.block_shape
 and boxed_integer = Primitive.boxed_integer =
     Pnativeint | Pint32 | Pint64
 
+and boxed_vector = Primitive.boxed_vector = 
+  | Pvec128
+
 and bigarray_kind = Lambda.bigarray_kind =
     Pbigarray_unknown
   | Pbigarray_float32 | Pbigarray_float64
diff --git a/ocaml/middle_end/closure/closure.ml b/ocaml/middle_end/closure/closure.ml
index fd9b3615375..7d009724b70 100644
--- a/ocaml/middle_end/closure/closure.ml
+++ b/ocaml/middle_end/closure/closure.ml
@@ -63,8 +63,10 @@ let is_gc_ignorable kind =
   | Pbottom -> Misc.fatal_error "[Pbottom] should not be stored in a closure."
   | Punboxed_float -> true
   | Punboxed_int _ -> true
+  | Punboxed_vector _ -> true
   | Pvalue Pintval -> true
-  | Pvalue (Pgenval | Pfloatval | Pboxedintval _ | Pvariant _ | Parrayval _) -> false
+  | Pvalue (Pgenval | Pfloatval | Pboxedintval _ | Pvariant _ | Parrayval _ |
+            Pboxedvectorval _) -> false
 
 let split_closure_fv kinds fv =
   List.fold_right (fun id (not_scanned, scanned) ->
diff --git a/ocaml/middle_end/flambda/closure_offsets.ml b/ocaml/middle_end/flambda/closure_offsets.ml
index cfb1791786d..d771cdd3e54 100644
--- a/ocaml/middle_end/flambda/closure_offsets.ml
+++ b/ocaml/middle_end/flambda/closure_offsets.ml
@@ -79,6 +79,7 @@ let add_closure_offsets
              and not stored in a closure."
         | Punboxed_float -> true
         | Punboxed_int _ -> true
+        | Punboxed_vector _ -> true
         | Pvalue Pintval -> true
         | Pvalue _ -> false)
       free_vars
diff --git a/ocaml/middle_end/flambda/flambda_to_clambda.ml b/ocaml/middle_end/flambda/flambda_to_clambda.ml
index 7a337694c12..10abe813aba 100644
--- a/ocaml/middle_end/flambda/flambda_to_clambda.ml
+++ b/ocaml/middle_end/flambda/flambda_to_clambda.ml
@@ -710,6 +710,7 @@ and to_clambda_set_of_closures t env
              and not stored in a closure."
         | Punboxed_float -> true
         | Punboxed_int _ -> true
+        | Punboxed_vector _ -> true
         | Pvalue Pintval -> true
         | Pvalue _ -> false)
       free_vars
diff --git a/ocaml/middle_end/printclambda.ml b/ocaml/middle_end/printclambda.ml
index 46056115b85..0919908b4eb 100644
--- a/ocaml/middle_end/printclambda.ml
+++ b/ocaml/middle_end/printclambda.ml
@@ -37,6 +37,7 @@ let rec value_kind0 ppf kind =
   | Pboxedintval Pnativeint -> Format.pp_print_string ppf ":nativeint"
   | Pboxedintval Pint32 -> Format.pp_print_string ppf ":int32"
   | Pboxedintval Pint64 -> Format.pp_print_string ppf ":int64"
+  | Pboxedvectorval Pvec128 -> Format.pp_print_string ppf ":vec128"
   | Pvariant { consts; non_consts } ->
     Format.fprintf ppf "@[<hov 1>[(consts (%a))@ (non_consts (%a))]@]"
       (Format.pp_print_list ~pp_sep:Format.pp_print_space Format.pp_print_int)
@@ -60,6 +61,7 @@ let layout (layout : Lambda.layout) =
   | Punboxed_int Pint32 -> ":unboxed_int32"
   | Punboxed_int Pint64 -> ":unboxed_int64"
   | Punboxed_int Pnativeint -> ":unboxed_nativeint"
+  | Punboxed_vector Pvec128 -> ":unboxed_vec128"
 
 let rec structured_constant ppf = function
   | Uconst_float x -> fprintf ppf "%F" x
diff --git a/ocaml/runtime/amd64.S b/ocaml/runtime/amd64.S
index 15b8e8734dd..aeac005207c 100644
--- a/ocaml/runtime/amd64.S
+++ b/ocaml/runtime/amd64.S
@@ -347,23 +347,23 @@ LBL(caml_call_gc):
     /* Save young_ptr */
         movq    %r15, Caml_state(young_ptr)
     /* Save floating-point registers */
-        subq    $(16*8), %rsp; CFI_ADJUST (16*8);
-        movsd   %xmm0, 0*8(%rsp)
-        movsd   %xmm1, 1*8(%rsp)
-        movsd   %xmm2, 2*8(%rsp)
-        movsd   %xmm3, 3*8(%rsp)
-        movsd   %xmm4, 4*8(%rsp)
-        movsd   %xmm5, 5*8(%rsp)
-        movsd   %xmm6, 6*8(%rsp)
-        movsd   %xmm7, 7*8(%rsp)
-        movsd   %xmm8, 8*8(%rsp)
-        movsd   %xmm9, 9*8(%rsp)
-        movsd   %xmm10, 10*8(%rsp)
-        movsd   %xmm11, 11*8(%rsp)
-        movsd   %xmm12, 12*8(%rsp)
-        movsd   %xmm13, 13*8(%rsp)
-        movsd   %xmm14, 14*8(%rsp)
-        movsd   %xmm15, 15*8(%rsp)
+        subq    $(16*16), %rsp; CFI_ADJUST (16*16);
+        movupd  %xmm0, 0*16(%rsp)
+        movupd  %xmm1, 1*16(%rsp)
+        movupd  %xmm2, 2*16(%rsp)
+        movupd  %xmm3, 3*16(%rsp)
+        movupd  %xmm4, 4*16(%rsp)
+        movupd  %xmm5, 5*16(%rsp)
+        movupd  %xmm6, 6*16(%rsp)
+        movupd  %xmm7, 7*16(%rsp)
+        movupd  %xmm8, 8*16(%rsp)
+        movupd  %xmm9, 9*16(%rsp)
+        movupd  %xmm10, 10*16(%rsp)
+        movupd  %xmm11, 11*16(%rsp)
+        movupd  %xmm12, 12*16(%rsp)
+        movupd  %xmm13, 13*16(%rsp)
+        movupd  %xmm14, 14*16(%rsp)
+        movupd  %xmm15, 15*16(%rsp)
     /* Call the garbage collector */
         PREPARE_FOR_C_CALL
         call    GCALL(caml_garbage_collection)
@@ -371,23 +371,23 @@ LBL(caml_call_gc):
     /* Restore young_ptr */
         movq    Caml_state(young_ptr), %r15
     /* Restore all regs used by the code generator */
-        movsd   0*8(%rsp), %xmm0
-        movsd   1*8(%rsp), %xmm1
-        movsd   2*8(%rsp), %xmm2
-        movsd   3*8(%rsp), %xmm3
-        movsd   4*8(%rsp), %xmm4
-        movsd   5*8(%rsp), %xmm5
-        movsd   6*8(%rsp), %xmm6
-        movsd   7*8(%rsp), %xmm7
-        movsd   8*8(%rsp), %xmm8
-        movsd   9*8(%rsp), %xmm9
-        movsd   10*8(%rsp), %xmm10
-        movsd   11*8(%rsp), %xmm11
-        movsd   12*8(%rsp), %xmm12
-        movsd   13*8(%rsp), %xmm13
-        movsd   14*8(%rsp), %xmm14
-        movsd   15*8(%rsp), %xmm15
-        addq    $(16*8), %rsp; CFI_ADJUST(-16*8)
+        movupd  0*16(%rsp), %xmm0
+        movupd  1*16(%rsp), %xmm1
+        movupd  2*16(%rsp), %xmm2
+        movupd  3*16(%rsp), %xmm3
+        movupd  4*16(%rsp), %xmm4
+        movupd  5*16(%rsp), %xmm5
+        movupd  6*16(%rsp), %xmm6
+        movupd  7*16(%rsp), %xmm7
+        movupd  8*16(%rsp), %xmm8
+        movupd  9*16(%rsp), %xmm9
+        movupd  10*16(%rsp), %xmm10
+        movupd  11*16(%rsp), %xmm11
+        movupd  12*16(%rsp), %xmm12
+        movupd  13*16(%rsp), %xmm13
+        movupd  14*16(%rsp), %xmm14
+        movupd  15*16(%rsp), %xmm15
+        addq    $(16*16), %rsp; CFI_ADJUST(-16*16)
         popq    %rax; CFI_ADJUST(-8)
         popq    %rbx; CFI_ADJUST(-8)
         popq    %rdi; CFI_ADJUST(-8)
@@ -476,23 +476,23 @@ FUNCTION(G(caml_call_local_realloc))
     /* Save young_ptr */
         movq    %r15, Caml_state(young_ptr)
     /* Save floating-point registers */
-        subq    $(16*8), %rsp; CFI_ADJUST (16*8);
-        movsd   %xmm0, 0*8(%rsp)
-        movsd   %xmm1, 1*8(%rsp)
-        movsd   %xmm2, 2*8(%rsp)
-        movsd   %xmm3, 3*8(%rsp)
-        movsd   %xmm4, 4*8(%rsp)
-        movsd   %xmm5, 5*8(%rsp)
-        movsd   %xmm6, 6*8(%rsp)
-        movsd   %xmm7, 7*8(%rsp)
-        movsd   %xmm8, 8*8(%rsp)
-        movsd   %xmm9, 9*8(%rsp)
-        movsd   %xmm10, 10*8(%rsp)
-        movsd   %xmm11, 11*8(%rsp)
-        movsd   %xmm12, 12*8(%rsp)
-        movsd   %xmm13, 13*8(%rsp)
-        movsd   %xmm14, 14*8(%rsp)
-        movsd   %xmm15, 15*8(%rsp)
+        subq    $(16*16), %rsp; CFI_ADJUST (16*16);
+        movupd  %xmm0, 0*16(%rsp)
+        movupd  %xmm1, 1*16(%rsp)
+        movupd  %xmm2, 2*16(%rsp)
+        movupd  %xmm3, 3*16(%rsp)
+        movupd  %xmm4, 4*16(%rsp)
+        movupd  %xmm5, 5*16(%rsp)
+        movupd  %xmm6, 6*16(%rsp)
+        movupd  %xmm7, 7*16(%rsp)
+        movupd  %xmm8, 8*16(%rsp)
+        movupd  %xmm9, 9*16(%rsp)
+        movupd  %xmm10, 10*16(%rsp)
+        movupd  %xmm11, 11*16(%rsp)
+        movupd  %xmm12, 12*16(%rsp)
+        movupd  %xmm13, 13*16(%rsp)
+        movupd  %xmm14, 14*16(%rsp)
+        movupd  %xmm15, 15*16(%rsp)
     /* Call the garbage collector */
         PREPARE_FOR_C_CALL
         call    GCALL(caml_local_realloc)
@@ -500,23 +500,23 @@ FUNCTION(G(caml_call_local_realloc))
     /* Restore young_ptr */
         movq    Caml_state(young_ptr), %r15
     /* Restore all regs used by the code generator */
-        movsd   0*8(%rsp), %xmm0
-        movsd   1*8(%rsp), %xmm1
-        movsd   2*8(%rsp), %xmm2
-        movsd   3*8(%rsp), %xmm3
-        movsd   4*8(%rsp), %xmm4
-        movsd   5*8(%rsp), %xmm5
-        movsd   6*8(%rsp), %xmm6
-        movsd   7*8(%rsp), %xmm7
-        movsd   8*8(%rsp), %xmm8
-        movsd   9*8(%rsp), %xmm9
-        movsd   10*8(%rsp), %xmm10
-        movsd   11*8(%rsp), %xmm11
-        movsd   12*8(%rsp), %xmm12
-        movsd   13*8(%rsp), %xmm13
-        movsd   14*8(%rsp), %xmm14
-        movsd   15*8(%rsp), %xmm15
-        addq    $(16*8), %rsp; CFI_ADJUST(-16*8)
+        movupd  0*16(%rsp), %xmm0
+        movupd  1*16(%rsp), %xmm1
+        movupd  2*16(%rsp), %xmm2
+        movupd  3*16(%rsp), %xmm3
+        movupd  4*16(%rsp), %xmm4
+        movupd  5*16(%rsp), %xmm5
+        movupd  6*16(%rsp), %xmm6
+        movupd  7*16(%rsp), %xmm7
+        movupd  8*16(%rsp), %xmm8
+        movupd  9*16(%rsp), %xmm9
+        movupd  10*16(%rsp), %xmm10
+        movupd  11*16(%rsp), %xmm11
+        movupd  12*16(%rsp), %xmm12
+        movupd  13*16(%rsp), %xmm13
+        movupd  14*16(%rsp), %xmm14
+        movupd  15*16(%rsp), %xmm15
+        addq    $(16*16), %rsp; CFI_ADJUST(-16*16)
         popq    %rax; CFI_ADJUST(-8)
         popq    %rbx; CFI_ADJUST(-8)
         popq    %rdi; CFI_ADJUST(-8)
diff --git a/ocaml/runtime/amd64nt.asm b/ocaml/runtime/amd64nt.asm
index 9185d06787d..3a90239cb47 100644
--- a/ocaml/runtime/amd64nt.asm
+++ b/ocaml/runtime/amd64nt.asm
@@ -69,45 +69,45 @@ caml_call_gc:
         push    rax
         Store_gc_regs rsp
     ; Save floating-point registers
-        sub     rsp, 16*8
-        movsd   QWORD PTR [rsp + 0*8], xmm0
-        movsd   QWORD PTR [rsp + 1*8], xmm1
-        movsd   QWORD PTR [rsp + 2*8], xmm2
-        movsd   QWORD PTR [rsp + 3*8], xmm3
-        movsd   QWORD PTR [rsp + 4*8], xmm4
-        movsd   QWORD PTR [rsp + 5*8], xmm5
-        movsd   QWORD PTR [rsp + 6*8], xmm6
-        movsd   QWORD PTR [rsp + 7*8], xmm7
-        movsd   QWORD PTR [rsp + 8*8], xmm8
-        movsd   QWORD PTR [rsp + 9*8], xmm9
-        movsd   QWORD PTR [rsp + 10*8], xmm10
-        movsd   QWORD PTR [rsp + 11*8], xmm11
-        movsd   QWORD PTR [rsp + 12*8], xmm12
-        movsd   QWORD PTR [rsp + 13*8], xmm13
-        movsd   QWORD PTR [rsp + 14*8], xmm14
-        movsd   QWORD PTR [rsp + 15*8], xmm15
+        sub     rsp, 16*16
+        movupd  QWORD PTR [rsp + 0*16], xmm0
+        movupd  QWORD PTR [rsp + 1*16], xmm1
+        movupd  QWORD PTR [rsp + 2*16], xmm2
+        movupd  QWORD PTR [rsp + 3*16], xmm3
+        movupd  QWORD PTR [rsp + 4*16], xmm4
+        movupd  QWORD PTR [rsp + 5*16], xmm5
+        movupd  QWORD PTR [rsp + 6*16], xmm6
+        movupd  QWORD PTR [rsp + 7*16], xmm7
+        movupd  QWORD PTR [rsp + 8*16], xmm8
+        movupd  QWORD PTR [rsp + 9*16], xmm9
+        movupd  QWORD PTR [rsp + 10*16], xmm10
+        movupd  QWORD PTR [rsp + 11*16], xmm11
+        movupd  QWORD PTR [rsp + 12*16], xmm12
+        movupd  QWORD PTR [rsp + 13*16], xmm13
+        movupd  QWORD PTR [rsp + 14*16], xmm14
+        movupd  QWORD PTR [rsp + 15*16], xmm15
     ; Call the garbage collector
         sub rsp, 32      ; PR#5008: bottom 32 bytes are reserved for callee
         call caml_garbage_collection
         add rsp, 32      ; PR#5008
     ; Restore all regs used by the code generator
-        movsd   xmm0, QWORD PTR [rsp + 0*8]
-        movsd   xmm1, QWORD PTR [rsp + 1*8]
-        movsd   xmm2, QWORD PTR [rsp + 2*8]
-        movsd   xmm3, QWORD PTR [rsp + 3*8]
-        movsd   xmm4, QWORD PTR [rsp + 4*8]
-        movsd   xmm5, QWORD PTR [rsp + 5*8]
-        movsd   xmm6, QWORD PTR [rsp + 6*8]
-        movsd   xmm7, QWORD PTR [rsp + 7*8]
-        movsd   xmm8, QWORD PTR [rsp + 8*8]
-        movsd   xmm9, QWORD PTR [rsp + 9*8]
-        movsd   xmm10, QWORD PTR [rsp + 10*8]
-        movsd   xmm11, QWORD PTR [rsp + 11*8]
-        movsd   xmm12, QWORD PTR [rsp + 12*8]
-        movsd   xmm13, QWORD PTR [rsp + 13*8]
-        movsd   xmm14, QWORD PTR [rsp + 14*8]
-        movsd   xmm15, QWORD PTR [rsp + 15*8]
-        add     rsp, 16*8
+        movupd  xmm0, QWORD PTR [rsp + 0*16]
+        movupd  xmm1, QWORD PTR [rsp + 1*16]
+        movupd  xmm2, QWORD PTR [rsp + 2*16]
+        movupd  xmm3, QWORD PTR [rsp + 3*16]
+        movupd  xmm4, QWORD PTR [rsp + 4*16]
+        movupd  xmm5, QWORD PTR [rsp + 5*16]
+        movupd  xmm6, QWORD PTR [rsp + 6*16]
+        movupd  xmm7, QWORD PTR [rsp + 7*16]
+        movupd  xmm8, QWORD PTR [rsp + 8*16]
+        movupd  xmm9, QWORD PTR [rsp + 9*16]
+        movupd  xmm10, QWORD PTR [rsp + 10*16]
+        movupd  xmm11, QWORD PTR [rsp + 11*16]
+        movupd  xmm12, QWORD PTR [rsp + 12*16]
+        movupd  xmm13, QWORD PTR [rsp + 13*16]
+        movupd  xmm14, QWORD PTR [rsp + 14*16]
+        movupd  xmm15, QWORD PTR [rsp + 15*16]
+        add     rsp, 16*16
         pop     rax
         pop     rbx
         pop     rdi
diff --git a/ocaml/testsuite/tests/typing-unboxed/test.ml b/ocaml/testsuite/tests/typing-unboxed/test.ml
index 6841b4957d5..7f8ad66716c 100644
--- a/ocaml/testsuite/tests/typing-unboxed/test.ml
+++ b/ocaml/testsuite/tests/typing-unboxed/test.ml
@@ -635,7 +635,7 @@ Line 1, characters 14-17:
 1 | external h : (int [@unboxed]) -> float = "h" "h_nat";;
                   ^^^
 Error: Don't know how to unbox this type.
-       Only float, int32, int64 and nativeint can be unboxed.
+       Only float, int32, int64, nativeint, and vec128 can be unboxed.
 |}]
 
 (* Bad: unboxing the function type *)
@@ -645,7 +645,7 @@ Line 1, characters 13-25:
 1 | external i : int -> float [@unboxed] = "i" "i_nat";;
                  ^^^^^^^^^^^^
 Error: Don't know how to unbox this type.
-       Only float, int32, int64 and nativeint can be unboxed.
+       Only float, int32, int64, nativeint, and vec128 can be unboxed.
 |}]
 
 (* Bad: unboxing a "deep" sub-type. *)
diff --git a/ocaml/typing/predef.ml b/ocaml/typing/predef.ml
index 192ae714a2e..a578cb921cd 100644
--- a/ocaml/typing/predef.ml
+++ b/ocaml/typing/predef.ml
@@ -44,6 +44,7 @@ and ident_option = ident_create "option"
 and ident_nativeint = ident_create "nativeint"
 and ident_int32 = ident_create "int32"
 and ident_int64 = ident_create "int64"
+and ident_vec128 = ident_create "vec128"
 and ident_lazy_t = ident_create "lazy_t"
 and ident_string = ident_create "string"
 and ident_extension_constructor = ident_create "extension_constructor"
@@ -66,6 +67,7 @@ and path_option = Pident ident_option
 and path_nativeint = Pident ident_nativeint
 and path_int32 = Pident ident_int32
 and path_int64 = Pident ident_int64
+and path_vec128 = Pident ident_vec128
 and path_lazy_t = Pident ident_lazy_t
 and path_string = Pident ident_string
 and path_extension_constructor = Pident ident_extension_constructor
@@ -86,6 +88,7 @@ and type_option t = newgenty (Tconstr(path_option, [t], ref Mnil))
 and type_nativeint = newgenty (Tconstr(path_nativeint, [], ref Mnil))
 and type_int32 = newgenty (Tconstr(path_int32, [], ref Mnil))
 and type_int64 = newgenty (Tconstr(path_int64, [], ref Mnil))
+and type_vec128 = newgenty (Tconstr(path_vec128, [], ref Mnil))
 and type_lazy_t t = newgenty (Tconstr(path_lazy_t, [t], ref Mnil))
 and type_string = newgenty (Tconstr(path_string, [], ref Mnil))
 and type_extension_constructor =
@@ -261,6 +264,7 @@ let common_initial_env add_type add_extension empty_env =
   |> add_type ident_unit
        ~kind:(variant [cstr ident_void []] [| [| |] |])
        ~layout:(Layout.immediate ~why:Enumeration)
+  |> add_type ident_vec128
   (* Predefined exceptions - alphabetical order *)
   |> add_extension ident_assert_failure
        [newgenty (Ttuple[type_string; type_int; type_int])]
diff --git a/ocaml/typing/predef.mli b/ocaml/typing/predef.mli
index 77656a3227c..ec53f19b386 100644
--- a/ocaml/typing/predef.mli
+++ b/ocaml/typing/predef.mli
@@ -33,6 +33,7 @@ val type_option: type_expr -> type_expr
 val type_nativeint: type_expr
 val type_int32: type_expr
 val type_int64: type_expr
+val type_vec128: type_expr
 val type_lazy_t: type_expr -> type_expr
 val type_extension_constructor:type_expr
 val type_floatarray:type_expr
@@ -52,6 +53,7 @@ val path_option: Path.t
 val path_nativeint: Path.t
 val path_int32: Path.t
 val path_int64: Path.t
+val path_vec128: Path.t
 val path_lazy_t: Path.t
 val path_extension_constructor: Path.t
 val path_floatarray: Path.t
diff --git a/ocaml/typing/primitive.ml b/ocaml/typing/primitive.ml
index 34804aaee3b..a3661a17289 100644
--- a/ocaml/typing/primitive.ml
+++ b/ocaml/typing/primitive.ml
@@ -21,9 +21,12 @@ open Layouts
 
 type boxed_integer = Pnativeint | Pint32 | Pint64
 
+type boxed_vector = Pvec128 
+
 type native_repr =
   | Same_as_ocaml_repr of Layouts.Sort.const
   | Unboxed_float
+  | Unboxed_vector of boxed_vector
   | Unboxed_integer of boxed_integer
   | Untagged_int
 
@@ -58,6 +61,7 @@ exception Error of Location.t * error
 let is_ocaml_repr = function
   | _, Same_as_ocaml_repr _ -> true
   | _, Unboxed_float
+  | _, Unboxed_vector _
   | _, Unboxed_integer _
   | _, Untagged_int -> false
 
@@ -65,12 +69,14 @@ let is_unboxed = function
   | _, Same_as_ocaml_repr _
   | _, Untagged_int -> false
   | _, Unboxed_float
+  | _, Unboxed_vector _
   | _, Unboxed_integer _ -> true
 
 let is_untagged = function
   | _, Untagged_int -> true
   | _, Same_as_ocaml_repr _
   | _, Unboxed_float
+  | _, Unboxed_vector _
   | _, Unboxed_integer _ -> false
 
 let rec make_native_repr_args arity x =
@@ -257,6 +263,7 @@ let print p osig_val_decl =
     (match repr with
      | Same_as_ocaml_repr _ -> []
      | Unboxed_float
+     | Unboxed_vector _
      | Unboxed_integer _ -> if all_unboxed then [] else [oattr_unboxed]
      | Untagged_int -> if all_untagged then [] else [oattr_untagged])
   in
@@ -286,20 +293,27 @@ let equal_boxed_integer bi1 bi2 =
   | (Pnativeint | Pint32 | Pint64), _ ->
     false
 
+let equal_boxed_vector bi1 bi2 = 
+  match bi1, bi2 with 
+  | Pvec128, Pvec128 -> true 
+
 let equal_native_repr nr1 nr2 =
   match nr1, nr2 with
   | Same_as_ocaml_repr s1, Same_as_ocaml_repr s2 -> Sort.equal_const s1 s2
   | Same_as_ocaml_repr _,
-    (Unboxed_float | Unboxed_integer _ | Untagged_int) -> false
+    (Unboxed_float | Unboxed_integer _ | Untagged_int | Unboxed_vector _) -> false
   | Unboxed_float, Unboxed_float -> true
   | Unboxed_float,
-    (Same_as_ocaml_repr _ | Unboxed_integer _ | Untagged_int) -> false
+    (Same_as_ocaml_repr _ | Unboxed_integer _ | Untagged_int | Unboxed_vector _) -> false
+  | Unboxed_vector vi1, Unboxed_vector vi2 -> equal_boxed_vector vi1 vi2
+  | Unboxed_vector _,
+    (Same_as_ocaml_repr _ | Unboxed_float | Untagged_int | Unboxed_integer _) -> false
   | Unboxed_integer bi1, Unboxed_integer bi2 -> equal_boxed_integer bi1 bi2
   | Unboxed_integer _,
-    (Same_as_ocaml_repr _ | Unboxed_float | Untagged_int) -> false
+    (Same_as_ocaml_repr _ | Unboxed_float | Untagged_int | Unboxed_vector _) -> false
   | Untagged_int, Untagged_int -> true
   | Untagged_int,
-    (Same_as_ocaml_repr _ | Unboxed_float | Unboxed_integer _) -> false
+    (Same_as_ocaml_repr _ | Unboxed_float | Unboxed_integer _ | Unboxed_vector _) -> false
 
 let equal_effects ef1 ef2 =
   match ef1, ef2 with
@@ -323,7 +337,7 @@ let native_name_is_external p =
 
 let sort_of_native_repr = function
   | Same_as_ocaml_repr s -> s
-  | (Unboxed_float | Unboxed_integer _ | Untagged_int) -> Sort.Value
+  | (Unboxed_float | Unboxed_integer _ | Untagged_int | Unboxed_vector _) -> Sort.Value
 
 let report_error ppf err =
   match err with
diff --git a/ocaml/typing/primitive.mli b/ocaml/typing/primitive.mli
index a52e550c5fe..98c15b2158d 100644
--- a/ocaml/typing/primitive.mli
+++ b/ocaml/typing/primitive.mli
@@ -17,11 +17,14 @@
 
 type boxed_integer = Pnativeint | Pint32 | Pint64
 
+type boxed_vector = Pvec128
+
 (* Representation of arguments/result for the native code version
    of a primitive *)
 type native_repr =
   | Same_as_ocaml_repr of Layouts.Sort.const
   | Unboxed_float
+  | Unboxed_vector of boxed_vector
   | Unboxed_integer of boxed_integer
   | Untagged_int
 
@@ -88,6 +91,7 @@ val native_name: description -> string
 val byte_name: description -> string
 
 val equal_boxed_integer : boxed_integer -> boxed_integer -> bool
+val equal_boxed_vector : boxed_vector -> boxed_vector -> bool
 val equal_native_repr : native_repr -> native_repr -> bool
 val equal_effects : effects -> effects -> bool
 val equal_coeffects : coeffects -> coeffects -> bool
diff --git a/ocaml/typing/typedecl.ml b/ocaml/typing/typedecl.ml
index 48a5ad061c2..acf2f7ca6bc 100644
--- a/ocaml/typing/typedecl.ml
+++ b/ocaml/typing/typedecl.ml
@@ -1862,6 +1862,8 @@ let native_repr_of_type env kind ty =
     Some (Unboxed_integer Pint64)
   | Unboxed, Tconstr (path, _, _) when Path.same path Predef.path_nativeint ->
     Some (Unboxed_integer Pnativeint)
+  | Unboxed, Tconstr (path, _, _) when Path.same path Predef.path_vec128 -> 
+    Some (Unboxed_vector Pvec128)
   | _ ->
     None
 
@@ -2502,7 +2504,7 @@ let report_error ppf = function
       fprintf ppf "Too many [@@unboxed]/[@@untagged] attributes"
   | Cannot_unbox_or_untag_type Unboxed ->
       fprintf ppf "@[Don't know how to unbox this type.@ \
-                   Only float, int32, int64 and nativeint can be unboxed.@]"
+                   Only float, int32, int64, nativeint, and vec128 can be unboxed.@]"
   | Cannot_unbox_or_untag_type Untagged ->
       fprintf ppf "@[Don't know how to untag this type.@ \
                    Only int can be untagged.@]"
diff --git a/ocaml/typing/typeopt.ml b/ocaml/typing/typeopt.ml
index 48eb70aebee..eacf781818d 100644
--- a/ocaml/typing/typeopt.ml
+++ b/ocaml/typing/typeopt.ml
@@ -284,6 +284,8 @@ let rec value_kind env ~loc ~visited ~depth ~num_nodes_visited ty
     num_nodes_visited, (Pboxedintval Pint64)
   | Tconstr(p, _, _) when Path.same p Predef.path_nativeint ->
     num_nodes_visited, (Pboxedintval Pnativeint)
+  | Tconstr(p, _, _) when Path.same p Predef.path_vec128 ->
+    num_nodes_visited, (Pboxedvectorval Pvec128)
   | Tconstr(p, _, _)
     when (Path.same p Predef.path_array
           || Path.same p Predef.path_floatarray) ->
@@ -569,7 +571,9 @@ let layout_union l1 l2 =
   | Punboxed_float, Punboxed_float -> Punboxed_float
   | Punboxed_int bi1, Punboxed_int bi2 ->
       if equal_boxed_integer bi1 bi2 then l1 else Ptop
-  | (Ptop | Pvalue _ | Punboxed_float | Punboxed_int _), _ ->
+  | Punboxed_vector bi1, Punboxed_vector bi2 ->
+      if equal_boxed_vector bi1 bi2 then l1 else Ptop
+  | (Ptop | Pvalue _ | Punboxed_float | Punboxed_int _ | Punboxed_vector _), _ ->
       Ptop
 
 (* Error report *)
diff --git a/tests/backend/regalloc_validator/check_regalloc_validation.ml b/tests/backend/regalloc_validator/check_regalloc_validation.ml
index 5764c1d2a63..cb63fe81b90 100644
--- a/tests/backend/regalloc_validator/check_regalloc_validation.ml
+++ b/tests/backend/regalloc_validator/check_regalloc_validation.ml
@@ -90,7 +90,7 @@ module Cfg_desc = struct
     let cfg =
       Cfg.create ~fun_name:"foo" ~fun_args:(Array.copy fun_args) ~fun_dbg:[]
         ~fun_fast:false ~fun_contains_calls
-        ~fun_num_stack_slots:(Array.make Proc.num_register_classes 0)
+        ~fun_num_stack_slots:(Array.make Proc.num_stack_slot_classes 0)
     in
     List.iter
       (fun (block : Block.t) ->
@@ -120,10 +120,10 @@ module Cfg_desc = struct
          count. *)
       let update_stack_slots i =
         let update_slot (r : Reg.t) =
-          match r.loc, Proc.register_class r with
-          | Stack (Local idx), reg_class ->
-            cfg.fun_num_stack_slots.(reg_class)
-              <- max cfg.fun_num_stack_slots.(reg_class) (idx + 1)
+          match r.loc, Proc.stack_slot_class r.typ with
+          | Stack (Local idx), stack_class ->
+            cfg.fun_num_stack_slots.(stack_class)
+              <- max cfg.fun_num_stack_slots.(stack_class) (idx + 1)
           | _ -> ()
         in
         Array.iter update_slot i.arg;
@@ -168,7 +168,7 @@ let entry_label =
          Cfg.create ~fun_name:"foo"
            ~fun_args:[| Proc.phys_reg 0 |]
            ~fun_dbg:[] ~fun_fast:false ~fun_contains_calls:false
-           ~fun_num_stack_slots:(Array.make Proc.num_register_classes 0)
+           ~fun_num_stack_slots:(Array.make Proc.num_stack_slot_classes 0)
        in
        Label.Tbl.add cfg.Cfg.blocks (Cfg.entry_label cfg)
          { start = Cfg.entry_label cfg;
@@ -559,7 +559,7 @@ let () =
       cfg, cfg)
     ~exp_std:"fatal exception raised when validating description"
     ~exp_err:
-      ">> Fatal error: instruction 20 has a register (V/37) with an unknown \
+      ">> Fatal error: instruction 20 has a register (V/53) with an unknown \
        location"
 
 let () =
@@ -1024,7 +1024,7 @@ let test_loop ~loop_loc_first n =
     ~exp_std:
       "Validation failed: Bad equations at entry point, reason: Unsatisfiable \
        equations when removing result equations.\n\
-       Existing equation has to agree one 0 or 2 sides (cannot on exactly 1) \
+       Existing equation has to agree on 0 or 2 sides (cannot be exactly 1) \
        with the removed equation.\n\
        Existing equation R[%rdi]=%rbx.\n\
        Removed equation: R[%rbx]=%rbx.\n\
diff --git a/tests/simd/dune b/tests/simd/dune
new file mode 100644
index 00000000000..142f04bf0db
--- /dev/null
+++ b/tests/simd/dune
@@ -0,0 +1,22 @@
+(executable
+ (name simd)
+ (modules simd)
+ (foreign_stubs (language c) (names stubs) (flags -msse4.2))
+ (ocamlopt_flags (:standard -fsimd)))
+
+(rule
+ (enabled_if
+  (= %{context_name} "main"))
+ (target simd.output)
+ (deps simd.exe)
+ (action
+  (with-outputs-to
+   simd.output
+   (run ./simd.exe))))
+
+(rule
+ (alias runtest)
+ (enabled_if
+  (= %{context_name} "main"))
+ (action
+  (diff simd.expected simd.output)))
diff --git a/tests/simd/simd.expected b/tests/simd/simd.expected
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/simd/simd.ml b/tests/simd/simd.ml
new file mode 100644
index 00000000000..e59e824193c
--- /dev/null
+++ b/tests/simd/simd.ml
@@ -0,0 +1,300 @@
+open Stdlib
+
+external vec128_of_int64s : int64 -> int64 -> vec128 = "" "vec128_of_int64s" [@@noalloc] [@@unboxed]
+external vec128_low_int64 : vec128 -> int64 = "" "vec128_low_int64" [@@noalloc] [@@unboxed]
+external vec128_high_int64 : vec128 -> int64 = "" "vec128_high_int64" [@@noalloc] [@@unboxed]
+
+let eq l r = if l <> r then Printf.printf "%Ld <> %Ld\n" l r
+
+let[@inline never] check v l h =
+  let vl, vh = vec128_low_int64 v, vec128_high_int64 v in
+  eq vl l;
+  eq vh h
+;;
+
+let[@inline never] combine v0 v1 =
+  let l0, h0 = vec128_low_int64 v0, vec128_high_int64 v0 in
+  let l1, h1 = vec128_low_int64 v1, vec128_high_int64 v1 in
+  vec128_of_int64s (Int64.add l0 l1) (Int64.add h0 h1)
+;;
+
+let[@inline never] combine_with_floats v0 f0 v1 f1 =
+  let l0, h0 = vec128_low_int64 v0, vec128_high_int64 v0 in
+  let l1, h1 = vec128_low_int64 v1, vec128_high_int64 v1 in
+  let l, h = Int64.add l0 l1, Int64.add h0 h1 in
+  let l = Int64.add (Int64.of_float f0) l in
+  let h = Int64.add (Int64.of_float f1) h in
+  vec128_of_int64s l h
+;;
+
+(* Identity *)
+let () =
+  let v = vec128_of_int64s 1L 2L in
+  let v = Sys.opaque_identity v in
+  check v 1L 2L
+;;
+
+(* Identity fn *)
+let () =
+  let v = vec128_of_int64s 1L 2L in
+  let[@inline never] id v = v in
+  let v = id v in
+  check v 1L 2L
+;;
+
+(* Pass to function *)
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v = combine v0 v1 in
+  check v 4L 6L
+;;
+
+(* Pass to function (inlined) *)
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v = (combine[@inlined hint]) v0 v1 in
+  check v 4L 6L
+;;
+
+(* Pass to function with floats *)
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let f0 = Sys.opaque_identity 5. in
+  let v = combine_with_floats v0 f0 v1 6. in
+  check v 9L 12L
+;;
+
+(* Pass to function with floats (inlined) *)
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v = (combine_with_floats[@inlined hint]) v0 5. v1 6. in
+  check v 9L 12L
+;;
+
+(* Capture in closure *)
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let f = combine v0 in
+  let f = Sys.opaque_identity f in
+  let v = f v1 in
+  check v 4L 6L
+;;
+
+(* Capture vectors and floats in a closure *)
+let () =
+  let[@inline never] f v0 v1 f0 v2 f1 v3 =
+    combine (combine_with_floats v0 f0 v1 f1) (combine v2 v3)
+  in
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v2 = vec128_of_int64s 4L 5L in
+  let v3 = vec128_of_int64s 6L 7L in
+  let f = f v0 v1 7. v2  in
+  let f = Sys.opaque_identity f in
+  let v = f 8. v3 in
+  check v 21L 26L
+;;
+
+(* Capture vectors and floats in a closure (inlined) *)
+let () =
+  let[@inline always] f v0 v1 f0 v2 f1 v3 =
+    (combine[@inlined hint])
+      ((combine_with_floats[@inlined hint]) v0 f0 v1 f1)
+      ((combine[@inlined hint]) v2 v3)
+  in
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v2 = vec128_of_int64s 4L 5L in
+  let v3 = vec128_of_int64s 6L 7L in
+  let f = f v0 v1 7. v2 in
+  let v = f 8. v3 in
+  check v 21L 26L
+;;
+
+(* Store in record *)
+type record = { a : vec128
+              ; mutable b : vec128
+              ; c : float }
+
+let () =
+  let record = { a = vec128_of_int64s 1L 2L; b = vec128_of_int64s 3L 4L; c = 5. } in
+  check record.a 1L 2L;
+  check record.b 3L 4L;
+  let record = Sys.opaque_identity record in
+  record.b <- vec128_of_int64s 5L 6L;
+  check record.a 1L 2L;
+  check record.b 5L 6L;
+  let v = combine_with_floats record.a record.c record.b 6. in
+  check v 11L 14L
+;;
+
+(* Store in variant *)
+type variant = A of vec128 | B of vec128 | C of float
+
+let () =
+  let variant = A (vec128_of_int64s 1L 2L) in
+  let variant = Sys.opaque_identity variant in
+  match variant with
+  | A v -> check v 1L 2L
+  | _ -> print_endline "fail";
+  let variant = ref (C 5.) in
+  let variant = Sys.opaque_identity variant in
+  variant := B (vec128_of_int64s 3L 4L);
+  match !variant with
+  | B v -> check v 3L 4L
+  | _ -> print_endline "fail"
+;;
+
+(* Pass boxed vectors to an external *)
+external boxed_combine : vec128 -> vec128 -> vec128 = "" "boxed_combine" [@@noalloc]
+
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v = boxed_combine v0 v1 in
+  check v 4L 6L
+;;
+
+(* Pass lots of vectors to an external *)
+external lots_of_vectors :
+  vec128 -> vec128 -> vec128 -> vec128 -> vec128 -> vec128 -> vec128 -> vec128 ->
+  vec128 -> vec128 -> vec128 -> vec128 -> vec128 -> vec128 -> vec128 -> vec128 ->
+  vec128
+  = "" "lots_of_vectors" [@@noalloc] [@@unboxed]
+
+let () =
+    let v0 = vec128_of_int64s 1L 2L in
+    let v1 = vec128_of_int64s 3L 4L in
+    let v2 = vec128_of_int64s 5L 6L in
+    let v3 = vec128_of_int64s 7L 8L in
+    let v4 = vec128_of_int64s 9L 10L in
+    let v5 = vec128_of_int64s 11L 12L in
+    let v6 = vec128_of_int64s 13L 14L in
+    let v7 = vec128_of_int64s 15L 16L in
+    let v8 = vec128_of_int64s 17L 18L in
+    let v9 = vec128_of_int64s 19L 20L in
+    let v10 = vec128_of_int64s 21L 22L in
+    let v11 = vec128_of_int64s 23L 24L in
+    let v12 = vec128_of_int64s 25L 26L in
+    let v13 = vec128_of_int64s 27L 28L in
+    let v14 = vec128_of_int64s 29L 30L in
+    let v15 = vec128_of_int64s 31L 32L in
+    let v = lots_of_vectors v0 v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 in
+    check v 256L 272L
+;;
+
+(* Pass mixed floats/vectors to an external *)
+external vectors_and_floats :
+  vec128 -> float -> vec128 -> float -> vec128 -> float -> vec128 -> float ->
+  float -> vec128 -> vec128 -> float -> float -> vec128 -> vec128 -> float ->
+  float -> float -> vec128 -> vec128 -> vec128 -> float -> float -> float ->
+  vec128
+  = "" "vectors_and_floats" [@@noalloc] [@@unboxed]
+
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v2 = vec128_of_int64s 5L 6L in
+  let v3 = vec128_of_int64s 7L 8L in
+  let v4 = vec128_of_int64s 9L 10L in
+  let v5 = vec128_of_int64s 11L 12L in
+  let v6 = vec128_of_int64s 13L 14L in
+  let v7 = vec128_of_int64s 15L 16L in
+  let v8 = vec128_of_int64s 17L 18L in
+  let v9 = vec128_of_int64s 19L 20L in
+  let v10 = vec128_of_int64s 21L 22L in
+  let v = vectors_and_floats v0 23. v1 24. v2 25. v3 26. 27. v4 v5 28. 29. v6 v7 30. 31. 32. v8 v9 v10 33. 34. 35. in
+  check v 377L 253L
+;;
+
+(* Pass mixed ints/floats/vectors to an external *)
+external vectors_and_floats_and_ints :
+  vec128 -> float -> vec128 -> int64 -> vec128 -> float -> vec128 -> int64 ->
+  int64 -> vec128 -> vec128 -> float -> float -> vec128 -> vec128 -> int64 ->
+  int64 -> float -> vec128 -> vec128 -> vec128 -> int64 -> int64 -> float ->
+  vec128
+  = "" "vectors_and_floats_and_ints" [@@noalloc] [@@unboxed]
+
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v2 = vec128_of_int64s 5L 6L in
+  let v3 = vec128_of_int64s 7L 8L in
+  let v4 = vec128_of_int64s 9L 10L in
+  let v5 = vec128_of_int64s 11L 12L in
+  let v6 = vec128_of_int64s 13L 14L in
+  let v7 = vec128_of_int64s 15L 16L in
+  let v8 = vec128_of_int64s 17L 18L in
+  let v9 = vec128_of_int64s 19L 20L in
+  let v10 = vec128_of_int64s 21L 22L in
+  let v = vectors_and_floats_and_ints v0 23. v1 24L v2 25. v3 26L 27L v4 v5 28. 29. v6 v7 30L 31L 32. v8 v9 v10 33L 34L 35. in
+  check v 377L 253L
+;;
+
+(* Vectors live across a probe handler *)
+let () =
+    let v0 = vec128_of_int64s 1L 2L in
+    let v1 = vec128_of_int64s 3L 4L in
+    let v2 = vec128_of_int64s 5L 6L in
+    let v3 = vec128_of_int64s 7L 8L in
+    let v4 = vec128_of_int64s 9L 10L in
+    let v5 = vec128_of_int64s 11L 12L in
+    let v6 = vec128_of_int64s 13L 14L in
+    let v7 = vec128_of_int64s 15L 16L in
+    let v8 = vec128_of_int64s 17L 18L in
+    let v9 = vec128_of_int64s 19L 20L in
+    let v10 = vec128_of_int64s 21L 22L in
+    let v11 = vec128_of_int64s 23L 24L in
+    let v12 = vec128_of_int64s 25L 26L in
+    let v13 = vec128_of_int64s 27L 28L in
+    let v14 = vec128_of_int64s 29L 30L in
+    let v15 = vec128_of_int64s 31L 32L in
+    [%probe "hello" ~enabled_at_init:true (
+      let xxx = vec128_of_int64s 0L 0L in
+      check xxx 0L 0L)];
+    let v = lots_of_vectors v0 v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 in
+    check v 256L 272L
+;;
+
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v2 = vec128_of_int64s 5L 6L in
+  let v3 = vec128_of_int64s 7L 8L in
+  let v4 = vec128_of_int64s 9L 10L in
+  let v5 = vec128_of_int64s 11L 12L in
+  let v6 = vec128_of_int64s 13L 14L in
+  let v7 = vec128_of_int64s 15L 16L in
+  let v8 = vec128_of_int64s 17L 18L in
+  let v9 = vec128_of_int64s 19L 20L in
+  let v10 = vec128_of_int64s 21L 22L in
+  [%probe "hello" ~enabled_at_init:true (
+      let xxx = vec128_of_int64s 0L 0L in
+      check xxx 0L 0L)];
+  let v = vectors_and_floats v0 23. v1 24. v2 25. v3 26. 27. v4 v5 28. 29. v6 v7 30. 31. 32. v8 v9 v10 33. 34. 35. in
+  check v 377L 253L
+;;
+
+let () =
+  let v0 = vec128_of_int64s 1L 2L in
+  let v1 = vec128_of_int64s 3L 4L in
+  let v2 = vec128_of_int64s 5L 6L in
+  let v3 = vec128_of_int64s 7L 8L in
+  let v4 = vec128_of_int64s 9L 10L in
+  let v5 = vec128_of_int64s 11L 12L in
+  let v6 = vec128_of_int64s 13L 14L in
+  let v7 = vec128_of_int64s 15L 16L in
+  let v8 = vec128_of_int64s 17L 18L in
+  let v9 = vec128_of_int64s 19L 20L in
+  let v10 = vec128_of_int64s 21L 22L in
+  [%probe "hello" ~enabled_at_init:true (
+      let xxx = vec128_of_int64s 0L 0L in
+      check xxx 0L 0L)];
+  let v = vectors_and_floats_and_ints v0 23. v1 24L v2 25. v3 26L 27L v4 v5 28. 29. v6 v7 30L 31L 32. v8 v9 v10 33L 34L 35. in
+  check v 377L 253L
+;;
diff --git a/tests/simd/stubs.c b/tests/simd/stubs.c
new file mode 100644
index 00000000000..edcb7692cfd
--- /dev/null
+++ b/tests/simd/stubs.c
@@ -0,0 +1,103 @@
+
+#include <caml/mlvalues.h>
+#include <caml/alloc.h>
+#include <caml/memory.h>
+#include <smmintrin.h>
+#include <emmintrin.h>
+
+int64_t vec128_low_int64(__m128i v)
+{
+  return _mm_extract_epi64(v, 0);
+}
+
+int64_t vec128_high_int64(__m128i v)
+{
+  return _mm_extract_epi64(v, 1);
+}
+
+__m128i vec128_of_int64s(int64_t low, int64_t high)
+{
+  return _mm_set_epi64x(high, low);
+}
+
+CAMLprim value boxed_combine(value v0, value v1)
+{
+  CAMLparam2(v0, v1);
+  CAMLlocal1(res);
+
+  __m128i l = _mm_loadu_si128((__m128i*)v0);
+  __m128i r = _mm_loadu_si128((__m128i*)v1);
+  __m128i result = _mm_add_epi64(l, r);
+  res = caml_alloc_small(2, Abstract_tag);
+  _mm_storeu_si128((__m128i*)res, result);
+
+  CAMLreturn(res);
+}
+
+__m128i lots_of_vectors(
+  __m128i v0, __m128i v1, __m128i v2, __m128i v3,
+  __m128i v4, __m128i v5, __m128i v6, __m128i v7,
+  __m128i v8, __m128i v9, __m128i v10, __m128i v11,
+  __m128i v12, __m128i v13, __m128i v14, __m128i v15)
+{
+  __m128i x0 = _mm_add_epi64(v0, v1);
+  __m128i x1 = _mm_add_epi64(v2, v3);
+  __m128i x2 = _mm_add_epi64(v4, v5);
+  __m128i x3 = _mm_add_epi64(v6, v7);
+  __m128i x4 = _mm_add_epi64(v8, v9);
+  __m128i x5 = _mm_add_epi64(v10, v11);
+  __m128i x6 = _mm_add_epi64(v12, v13);
+  __m128i x7 = _mm_add_epi64(v14, v15);
+  __m128i y0 = _mm_add_epi64(x0, x1);
+  __m128i y1 = _mm_add_epi64(x2, x3);
+  __m128i y2 = _mm_add_epi64(x4, x5);
+  __m128i y3 = _mm_add_epi64(x6, x7);
+  __m128i z0 = _mm_add_epi64(y0, y1);
+  __m128i z1 = _mm_add_epi64(y2, y3);
+  return _mm_add_epi64(z0, z1);
+}
+
+__m128i vectors_and_floats(
+  __m128i v0, double f0, __m128i v1, double f1,
+  __m128i v2, double f2, __m128i v3, double f3,
+  double f4, __m128i v4, __m128i v5, double f5,
+  double f6, __m128i v6, __m128i v7, double f7,
+  double f8, double f9, __m128i v8, __m128i v9,
+  __m128i v10, double f10, double f11, double f12)
+{
+  __m128i x0 = _mm_add_epi64(v0, v1);
+  __m128i x1 = _mm_add_epi64(v2, v3);
+  __m128i x2 = _mm_add_epi64(v4, v5);
+  __m128i x3 = _mm_add_epi64(v6, v7);
+  __m128i x4 = _mm_add_epi64(v8, v9);
+  __m128i y0 = _mm_add_epi64(x0, x1);
+  __m128i y1 = _mm_add_epi64(x2, x3);
+  __m128i y2 = _mm_add_epi64(v10, x4);
+  __m128i z0 = _mm_add_epi64(y0, y1);
+  __m128i z = _mm_add_epi64(z0, y2);
+  double f = f0 + f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 + f12;
+  return vec128_of_int64s((int64_t)f, vec128_low_int64(z) + vec128_high_int64(z));
+}
+
+__m128i vectors_and_floats_and_ints(
+  __m128i v0, double f0, __m128i v1, int64_t i0,
+  __m128i v2, double f1, __m128i v3, int64_t i1,
+  int64_t i2, __m128i v4, __m128i v5, double f2,
+  double f3, __m128i v6, __m128i v7, int64_t i3,
+  int64_t i4, double f4, __m128i v8, __m128i v9,
+  __m128i v10, int64_t i5, int64_t i6, double f5)
+{
+  __m128i x0 = _mm_add_epi64(v0, v1);
+  __m128i x1 = _mm_add_epi64(v2, v3);
+  __m128i x2 = _mm_add_epi64(v4, v5);
+  __m128i x3 = _mm_add_epi64(v6, v7);
+  __m128i x4 = _mm_add_epi64(v8, v9);
+  __m128i y0 = _mm_add_epi64(x0, x1);
+  __m128i y1 = _mm_add_epi64(x2, x3);
+  __m128i y2 = _mm_add_epi64(v10, x4);
+  __m128i z0 = _mm_add_epi64(y0, y1);
+  __m128i z = _mm_add_epi64(z0, y2);
+  double f = f0 + f1 + f2 + f3 + f4 + f5;
+  int64_t i = i0 + i1 + i2 + i3 + i4 + i5 + i6;
+  return vec128_of_int64s((int64_t)f + i, vec128_low_int64(z) + vec128_high_int64(z));
+}
diff --git a/testsuite/tests/unboxed-primitive-args/README b/testsuite/tests/unboxed-primitive-args/README
new file mode 100644
index 00000000000..4bd7601e511
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/README
@@ -0,0 +1,26 @@
+This directory contains tests to check that OCaml values are correctly
+passed between OCaml and C when a primitive takes some or all of its
+arguments unboxed/untagged and/or return its result unboxed/untagged.
+
+To test one primitive we do:
+- write all its argument and expected result in buffer A
+- call the C external using arguments read from buffer A
+- the C function write all the arguments it receive into buffer B
+- the C function read the result from buffer A and returns it
+- on the OCaml side we write the received result into buffer B
+- the test is successful if A and B have the same contents
+
+Between each call, we call a function with 128 value arguments set to
+0 and a function with 32 unboxed float arguments set to 0., just to
+clean-up the registers and stacks in case garbage would make a test
+succeed. We don't pass more floats as it doesn't build on arm32.
+
+We construct the set of primitives to test as follow:
+- all combination of unboxed int32/int64/float arguments for functions
+  taking up to 6 arguments (with more than 6 ocamlopt takes a really
+  long time to compile the test files)
+- a bunch of manual tests for the rest and specific patterns.
+  The list is [Gen_test.manual_tests]
+
+We test the set of primitives a thousand times, with different random
+data each time.
diff --git a/testsuite/tests/unboxed-primitive-args/common.ml b/testsuite/tests/unboxed-primitive-args/common.ml
new file mode 100644
index 00000000000..03b1f0543e2
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/common.ml
@@ -0,0 +1,304 @@
+open StdLabels
+
+open Bigarray
+
+type 'a typ =
+  | Int       : int       typ
+  | Int32     : int32     typ
+  | Int64     : int64     typ
+  | Nativeint : nativeint typ
+  | Float     : float     typ
+  | Vec128    : vec128    typ
+
+type 'a proto =
+  | Ret : 'a typ -> 'a proto
+  | Abs : 'a typ * 'b proto -> ('a -> 'b) proto
+
+let ( ** ) x y = Abs (x, y)
+
+(* This form is easier to process programmatically. We don't expose it as
+   ocamlopt takes a really really long time to compile a constant list
+   of these. *)
+type simplified_test = Test : string * 'a * 'a proto -> simplified_test
+
+type test =
+  | T1 : string * ('a -> 'b) * 'a typ * 'b typ -> test
+  | T2 : string * ('a -> 'b -> 'c) * 'a typ * 'b typ * 'c typ -> test
+  | T3 : string * ('a -> 'b -> 'c -> 'd) *
+         'a typ * 'b typ * 'c typ * 'd typ -> test
+  | T4 : string * ('a -> 'b -> 'c -> 'd -> 'e) *
+         'a typ * 'b typ * 'c typ * 'd typ * 'e typ -> test
+  | T5 : string * ('a -> 'b -> 'c -> 'd -> 'e -> 'f) *
+         'a typ * 'b typ * 'c typ * 'd typ * 'e typ * 'f typ -> test
+  | T6 : string * ('a -> 'b -> 'c -> 'd -> 'e -> 'f -> 'g) *
+         'a typ * 'b typ * 'c typ * 'd typ * 'e typ * 'f typ * 'g typ -> test
+  | T : string * 'a * 'a proto -> test
+
+let expand_test = function
+  | T1 (s, fn, a, b) -> Test (s, fn, a ** Ret b)
+  | T2 (s, fn, a, b, c) -> Test (s, fn, a ** b ** Ret c)
+  | T3 (s, fn, a, b, c, d) -> Test (s, fn, a ** b ** c ** Ret d)
+  | T4 (s, fn, a, b, c, d, e) -> Test (s, fn, a ** b ** c ** d ** Ret e)
+  | T5 (s, fn, a, b, c, d, e, f) ->
+    Test (s, fn, a ** b ** c ** d ** e ** Ret f)
+  | T6 (s, fn, a, b, c, d, e, f, g) ->
+    Test (s, fn, a ** b ** c ** d ** e ** f ** Ret g)
+  | T (s, fn, p) -> Test (s, fn, p)
+
+external vec128_of_int64s : int64 -> int64 -> vec128 = "" "vec128_of_int64s" [@@noalloc] [@@unboxed]
+external vec128_low_int64 : vec128 -> int64 = "" "vec128_low_int64" [@@noalloc] [@@unboxed]
+external vec128_high_int64 : vec128 -> int64 = "" "vec128_high_int64" [@@noalloc] [@@unboxed]
+
+let string_of : type a. a typ -> a -> string = function
+  | Int       -> Int.to_string
+  | Int32     -> Printf.sprintf "%ldl"
+  | Int64     -> Printf.sprintf "%LdL"
+  | Nativeint -> Printf.sprintf "%ndn"
+  | Float     ->
+      fun f -> Printf.sprintf "float_of_bits 0x%LxL" (Int64.bits_of_float f)
+  | Vec128    ->
+      fun v -> Printf.sprintf "vec128 %016Lx:%016Lx" (vec128_high_int64 v) (vec128_low_int64 v)
+
+let rec arity : type a. a proto -> int = function
+  | Ret _ -> 0
+  | Abs (_, p) -> 1 + arity p
+
+module Buffer = struct
+  type t = (char, int8_unsigned_elt, c_layout) Array1.t
+
+  let arg_size = 16
+
+  let create ~arity : t =
+    Array1.create char c_layout ((arity + 1) * arg_size)
+
+  let clear (t : t) = Array1.fill t '\000'
+
+  let length : t -> int = Array1.dim
+
+  external init_c_side : ocaml_buffer:t -> c_buffer:t -> unit
+    = "test_set_buffers"
+
+  external get_int32 : t -> int -> int32 = "%caml_bigstring_get32"
+  external get_int64 : t -> int -> int64 = "%caml_bigstring_get64"
+  external set_int32 : t -> int -> int32 -> unit = "%caml_bigstring_set32"
+  external set_int64 : t -> int -> int64 -> unit = "%caml_bigstring_set64"
+
+  let get_vec128 buf ~arg =
+    let low, high = get_int64 buf (arg * arg_size), get_int64 buf (arg * arg_size + 8) in
+    vec128_of_int64s low high
+
+  let set_vec128 buf ~arg x =
+    set_int64 buf (arg * arg_size) (vec128_low_int64 x);
+    set_int64 buf ((arg * arg_size) + 8) (vec128_high_int64 x)
+
+  let get_int32 t ~arg = get_int32 t (arg * arg_size)
+  let get_int64 t ~arg = get_int64 t (arg * arg_size)
+  let set_int32 t ~arg x = set_int32 t (arg * arg_size) x
+  let set_int64 t ~arg x = set_int64 t (arg * arg_size) x
+
+  let get_nativeint, set_nativeint =
+    match Sys.word_size with
+    | 32 -> ((fun t ~arg -> get_int32 t ~arg |> Nativeint.of_int32),
+             (fun t ~arg x -> set_int32 t ~arg (Nativeint.to_int32 x)))
+    | 64 -> ((fun t ~arg -> get_int64 t ~arg |> Int64.to_nativeint),
+             (fun t ~arg x -> set_int64 t ~arg (Int64.of_nativeint x)))
+    | n  -> Printf.ksprintf failwith "unknown word size (%d)" n
+
+  let get_int =
+    if Sys.word_size = 32 then
+      fun buf ~arg -> get_int32 buf ~arg |> Int32.to_int
+    else
+      fun buf ~arg -> get_int64 buf ~arg |> Int64.to_int
+
+  let set_int =
+    if Sys.word_size = 32 then
+      fun buf ~arg x -> set_int32 buf ~arg (Int32.of_int x)
+    else
+      fun buf ~arg x -> set_int64 buf ~arg (Int64.of_int x)
+
+  let get_float buf ~arg = get_int64 buf ~arg |> Int64.float_of_bits
+  let set_float buf ~arg x = set_int64 buf ~arg (Int64.bits_of_float x)
+
+  let get : type a. a typ -> t -> arg:int -> a = function
+    | Int       -> get_int
+    | Int32     -> get_int32
+    | Int64     -> get_int64
+    | Nativeint -> get_nativeint
+    | Float     -> get_float
+    | Vec128    -> get_vec128
+
+  let set : type a. a typ -> t -> arg:int -> a -> unit = function
+    | Int       -> set_int
+    | Int32     -> set_int32
+    | Int64     -> set_int64
+    | Nativeint -> set_nativeint
+    | Float     -> set_float
+    | Vec128    -> set_vec128
+
+  (* This is almost a memcpy except that we use get/set which should
+     ensure that the values in [dst] don't overflow. *)
+  let copy_args ~src ~dst proto =
+    let rec loop : type a. a proto -> int -> unit = fun proto arg ->
+      match proto with
+      | Ret typ ->
+        set typ dst ~arg (get typ src ~arg)
+      | Abs (typ, rest) ->
+        set typ dst ~arg (get typ src ~arg);
+        loop rest (arg + 1)
+    in
+    loop proto 0
+end
+
+let exec proto f ~ocaml_buffer ~c_buffer =
+  let rec loop : type a. a proto -> a -> int -> unit = fun proto f arg ->
+    match proto with
+    | Ret typ ->
+      Buffer.set typ c_buffer ~arg f
+    | Abs (typ, rest) ->
+      let x = Buffer.get typ ocaml_buffer ~arg in
+      loop rest (f x) (arg + 1)
+  in
+  loop proto f 0
+
+let strings_of_test_instance name proto buffer =
+  let rec loop : type a. a proto -> int -> string list -> string list * string =
+    fun proto arg acc ->
+      match proto with
+      | Ret typ ->
+        (List.rev acc, string_of typ (Buffer.get typ buffer ~arg))
+      | Abs (typ, rest) ->
+        let s = string_of typ (Buffer.get typ buffer ~arg) in
+        loop rest (arg + 1) (s :: acc)
+  in
+  loop proto 0 []
+
+let typ_size : type a. a typ -> int = function
+  | Int       -> Sys.word_size / 8
+  | Int32     -> 4
+  | Int64     -> 8
+  | Nativeint -> Sys.word_size / 8
+  | Float     -> 8
+  | Vec128    -> 16
+
+let rec sizes : type a. a proto -> int list = function
+  | Ret typ         -> [typ_size typ]
+  | Abs (typ, rest) -> typ_size typ :: sizes rest
+
+let print_hex ~sizes ~arity buffer =
+  let printf = Printf.printf in
+  printf "(";
+  for i = 0 to arity do
+    if i = arity then
+      printf ") -> "
+    else if i > 0 then
+      printf ", ";
+    for ofs = i * Buffer.arg_size to i * Buffer.arg_size + sizes.(i) - 1 do
+      printf "%02x" (Char.code buffer.{ofs});
+    done;
+  done
+
+let printed_mismatches = ref 0
+
+let print_mismatch name proto ~ocaml_buffer ~c_buffer =
+  let printf = Printf.printf in
+  printf "Mismatch for %s\n" name;
+  let o_args, o_res = strings_of_test_instance name proto ocaml_buffer in
+  let c_args, c_res = strings_of_test_instance name proto     c_buffer in
+  let o_args, c_args =
+    (* Align arguments *)
+    List.map2 o_args c_args ~f:(fun a b ->
+      let len_a = String.length a and len_b = String.length b in
+      let len = max len_a len_b in
+      (Printf.sprintf "%*s" len a,
+       Printf.sprintf "%*s" len b))
+    |> List.split
+  in
+  printf "ocaml side : (%s) -> %s\n" (String.concat ~sep:", " o_args) o_res;
+  printf "c side     : (%s) -> %s\n" (String.concat ~sep:", " c_args) c_res;
+  let sizes = sizes proto |> Array.of_list in
+  let arity = arity proto in
+  printf "ocaml side : "; print_hex ~sizes ~arity ocaml_buffer; printf "\n";
+  printf "c side     : "; print_hex ~sizes ~arity     c_buffer; printf "\n";
+  incr printed_mismatches;
+  if !printed_mismatches >= 1000 then begin
+    printf "Output truncated at 1000 failures.";
+    exit 0
+  end
+
+external cleanup_normal
+  :  int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int -> int -> int -> int -> int -> int -> int -> int
+  -> int = "" "test_cleanup_normal" [@@noalloc]
+
+external cleanup_float
+  :  float -> float -> float -> float -> float -> float -> float -> float
+  -> float -> float -> float -> float -> float -> float -> float -> float
+  -> float -> float -> float -> float -> float -> float -> float -> float
+  -> float -> float -> float -> float -> float -> float -> float -> float
+  -> float = "" "test_cleanup_float" [@@noalloc] [@@unboxed]
+
+let cleanup_args_and_stack () =
+  let _ : int =
+    cleanup_normal
+      0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+      0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+      0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+      0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+  in
+  let _ : float =
+    cleanup_float
+       0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
+       0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
+  in
+  ()
+
+let run_test ~random_data ~ocaml_buffer ~c_buffer (Test (name, f, proto)) =
+  Buffer.clear ocaml_buffer;
+  Buffer.clear c_buffer;
+  Buffer.copy_args ~src:random_data ~dst:ocaml_buffer proto;
+  cleanup_args_and_stack ();
+  exec proto f ~ocaml_buffer ~c_buffer;
+  let success = ocaml_buffer = c_buffer in
+  if not success then print_mismatch name proto ~ocaml_buffer ~c_buffer;
+  success
+
+let run_tests tests =
+  let tests = List.map tests ~f:expand_test in
+  let max_args =
+    List.fold_left tests ~init:0 ~f:(fun acc (Test (_, _, p)) ->
+      max acc (arity p))
+  in
+
+  let ocaml_buffer = Buffer.create ~arity:max_args
+  and     c_buffer = Buffer.create ~arity:max_args in
+  Buffer.init_c_side ~ocaml_buffer ~c_buffer;
+
+  let random_data = Buffer.create ~arity:max_args in
+  let new_random_data () =
+    for i = 0 to Buffer.length random_data - 1 do
+      random_data.{i} <- char_of_int (Random.int 256)
+    done
+  in
+
+  let failure = ref false in
+  for i = 1 to 1000 do
+    new_random_data ();
+    List.iter tests ~f:(fun test ->
+      if not (run_test ~random_data ~ocaml_buffer ~c_buffer test) then
+        failure := true)
+  done;
+  exit (if !failure then 1 else 0)
diff --git a/testsuite/tests/unboxed-primitive-args/common.mli b/testsuite/tests/unboxed-primitive-args/common.mli
new file mode 100644
index 00000000000..362ef4f916b
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/common.mli
@@ -0,0 +1,30 @@
+(** Type of arguments/result *)
+type 'a typ =
+  | Int       : int       typ
+  | Int32     : int32     typ
+  | Int64     : int64     typ
+  | Nativeint : nativeint typ
+  | Float     : float     typ
+  | Vec128    : vec128    typ
+
+type 'a proto =
+  | Ret : 'a typ -> 'a proto
+  | Abs : 'a typ * 'b proto -> ('a -> 'b) proto
+
+(** Same as [Abs]. We choose this operator for its associativity. *)
+val ( ** ) : 'a typ -> 'b proto -> ('a -> 'b) proto
+
+type test =
+  | T1 : string * ('a -> 'b) * 'a typ * 'b typ -> test
+  | T2 : string * ('a -> 'b -> 'c) * 'a typ * 'b typ * 'c typ -> test
+  | T3 : string * ('a -> 'b -> 'c -> 'd) *
+         'a typ * 'b typ * 'c typ * 'd typ -> test
+  | T4 : string * ('a -> 'b -> 'c -> 'd -> 'e) *
+         'a typ * 'b typ * 'c typ * 'd typ * 'e typ -> test
+  | T5 : string * ('a -> 'b -> 'c -> 'd -> 'e -> 'f) *
+         'a typ * 'b typ * 'c typ * 'd typ * 'e typ * 'f typ -> test
+  | T6 : string * ('a -> 'b -> 'c -> 'd -> 'e -> 'f -> 'g) *
+         'a typ * 'b typ * 'c typ * 'd typ * 'e typ * 'f typ * 'g typ -> test
+  | T : string * 'a * 'a proto -> test
+
+val run_tests : test list -> unit
diff --git a/testsuite/tests/unboxed-primitive-args/gen_test.ml b/testsuite/tests/unboxed-primitive-args/gen_test.ml
new file mode 100644
index 00000000000..80e11178296
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/gen_test.ml
@@ -0,0 +1,249 @@
+(* This programs generate stubs with various prototype combinations *)
+
+open StdLabels
+
+type boxed_integer = Pnativeint | Pint32 | Pint64
+
+type boxed_vector = Pvec128
+
+type native_repr =
+  | Same_as_ocaml_repr
+  | Unboxed_float
+  | Unboxed_integer of boxed_integer
+  | Untagged_int
+  | Unboxed_vector of boxed_vector
+
+(* Generate primitives with up to this number of arguments *)
+let test_all_combination_up_to_n_args = 6
+
+(* Generate primitives using all combination of these argument
+   representations. No need to test all combination of other
+   representations: regarding the calling convention
+   [Same_as_ocaml_repr], [Untagged_int] and
+   [Unboxed_integer Pnativeint] are all the same, and are the
+   same as [Unboxed_integer Pint<word-size>].
+
+   We have specific tests for the other representations and for the
+   result representation in [manual_tests].
+*)
+let test_all_args_combination_of =
+  [ Unboxed_float
+  ; Unboxed_integer Pint32
+  ; Unboxed_integer Pint64
+  ; Unboxed_vector Pvec128
+  ]
+
+let code_of_repr = function
+  | Same_as_ocaml_repr         -> "v" (* for "value" *)
+  | Unboxed_float              -> "f"
+  | Unboxed_integer Pint32     -> "l"
+  | Unboxed_integer Pint64     -> "L"
+  | Unboxed_integer Pnativeint -> "n"
+  | Untagged_int               -> "i"
+  | Unboxed_vector Pvec128     -> "x" (* for "xmm" *)
+
+let repr_of_code = function
+  | 'v' -> Same_as_ocaml_repr
+  | 'f' -> Unboxed_float
+  | 'l' -> Unboxed_integer Pint32
+  | 'L' -> Unboxed_integer Pint64
+  | 'n' -> Unboxed_integer Pnativeint
+  | 'i' -> Untagged_int
+  | 'x' -> Unboxed_vector Pvec128
+  | _   -> assert false
+
+let manual_tests =
+  [ "v_v"
+  ; "f_f"
+  ; "l_l"
+  ; "L_L"
+  ; "n_n"
+  ; "i_i"
+  ; "x_x"
+  ; "f_fffff"
+  ; "f_ffffff"
+  ; "f_fffffff"
+  ; "f_fffffffffffffffff"
+  ; "x_xxxxx"
+  ; "x_xxxxxx"
+  ; "x_xxxxxxx"
+  ; "x_xxxxxxxxxxxxxxxxx"
+  ; "v_iiiiiiiiiiiiiiiii"
+  ; "v_lllllllllllllllll"
+  ; "v_LLLLLLLLLLLLLLLLL"
+  ; "v_iLiLiLiLiLiLiLiLi"
+  ; "v_LiLiLiLiLiLiLiLiL"
+  ; "v_flflflflflflflflflflflflflflflflflfl"
+  ; "v_fLfLfLfLfLfLfLfLfLfLfLfLfLfLfLfLfLfL"
+  ; "v_xfxfxfxfxfxfxfxfx"
+  ; "v_fxfxfxfxfxfxfxfxf"
+  ; "v_lfxlfxlfxlfxlfxlfx"
+  ; "v_lflxlxlflflxlxlflx"
+  ; "v_llllllfffffflxxllxx"
+  ]
+
+let ocaml_type_of_repr = function
+  (* Doesn't really matters what we choose for this case *)
+  | Same_as_ocaml_repr         -> "int"
+  | Unboxed_float              -> "(float [@unboxed])"
+  | Unboxed_integer Pint32     -> "(int32 [@unboxed])"
+  | Unboxed_integer Pint64     -> "(int64 [@unboxed])"
+  | Unboxed_integer Pnativeint -> "(nativeint [@unboxed])"
+  | Untagged_int               -> "(int [@untagged])"
+  | Unboxed_vector Pvec128     -> "(vec128 [@unboxed])"
+
+let ocaml_type_gadt_of_repr = function
+  (* Doesn't really matters what we choose for this case *)
+  | Same_as_ocaml_repr         -> "Int"
+  | Unboxed_float              -> "Float"
+  | Unboxed_integer Pint32     -> "Int32"
+  | Unboxed_integer Pint64     -> "Int64"
+  | Unboxed_integer Pnativeint -> "Nativeint"
+  | Untagged_int               -> "Int"
+  | Unboxed_vector Pvec128     -> "Vec128"
+
+let c_type_of_repr = function
+  | Same_as_ocaml_repr         -> "value"
+  | Unboxed_float              -> "double"
+  | Unboxed_integer Pint32     -> "int32_t"
+  | Unboxed_integer Pint64     -> "int64_t"
+  | Unboxed_integer Pnativeint -> "intnat"
+  | Untagged_int               -> "intnat"
+  | Unboxed_vector Pvec128     -> "__m128i"
+
+type proto =
+  { params : native_repr list
+  ; return : native_repr
+  }
+
+let rec explode s =
+  let rec loop i acc =
+    if i < 0 then
+      acc
+    else
+      loop (i - 1) (s.[i] :: acc)
+  in
+  loop (String.length s - 1) []
+
+let proto_of_str s =
+  Scanf.sscanf s "%c_%s" (fun return params ->
+    { params = List.map (explode params) ~f:repr_of_code
+    ; return = repr_of_code return
+    })
+
+let function_name_of_proto proto =
+  Printf.sprintf "test_%s_%s" (code_of_repr proto.return)
+    (String.concat ~sep:"" (List.map proto.params ~f:code_of_repr))
+
+let ocaml_type_gadt_of_proto proto =
+  Printf.sprintf "%s ** Ret %s"
+    (String.concat ~sep:" ** "
+       (List.map proto.params ~f:ocaml_type_gadt_of_repr))
+    (ocaml_type_gadt_of_repr proto.return)
+
+let ocaml_type_of_proto proto =
+  String.concat ~sep:" -> "
+    (List.map proto.params ~f:ocaml_type_of_repr
+     @ [ocaml_type_of_repr proto.return])
+
+let c_args_of_proto proto =
+  String.concat ~sep:", "
+    (List.mapi proto.params ~f:(fun i p ->
+       Printf.sprintf "%s x%d" (c_type_of_repr p) i))
+
+let manual_protos = List.map manual_tests ~f:proto_of_str
+
+let iter_protos ~f =
+  let iter_for_arity arity =
+    let rec loop params to_gen =
+      List.iter test_all_args_combination_of ~f:(fun repr ->
+        let params = repr :: params in
+        let to_gen = to_gen - 1 in
+        if to_gen = 0 then
+          f { params = List.rev params
+            ; return = Same_as_ocaml_repr
+            }
+        else
+          loop params to_gen)
+    in
+    loop [] arity
+  in
+  let rec iter_arities arity =
+    if arity <= test_all_combination_up_to_n_args then begin
+      iter_for_arity arity;
+      iter_arities (arity + 1)
+    end
+  in
+  List.iter manual_protos ~f;
+  iter_arities 1
+
+let pr fmt = Printf.ksprintf (fun s -> print_string s; print_char '\n') fmt
+
+let generate_ml () =
+  pr "open Common";
+  pr "";
+  iter_protos ~f:(fun proto ->
+    let name = function_name_of_proto proto in
+    pr "external %s : %s = \"\" %S [@@noalloc]"
+      name (ocaml_type_of_proto proto) name;
+  );
+  pr "";
+  pr "let tests = []";
+  iter_protos ~f:(fun proto ->
+    let name = function_name_of_proto proto in
+    let arity = List.length proto.params in
+    if arity <= 6 then
+      pr "let tests = T%d (%S, %s, %s, %s) :: tests"
+        arity name name
+        (List.map proto.params ~f:ocaml_type_gadt_of_repr
+         |> String.concat ~sep:", ")
+        (ocaml_type_gadt_of_repr proto.return)
+    else
+      pr "let tests = T (%S, %s, %s) :: tests"
+        name name (ocaml_type_gadt_of_proto proto));
+  pr "";
+  pr "let () = run_tests (List.rev tests)"
+
+let generate_stubs () =
+  pr "#include <stdio.h>";
+  pr "#include <caml/mlvalues.h>";
+  pr "#include \"test_common.h\"";
+  iter_protos ~f:(fun proto ->
+      let name = function_name_of_proto proto in
+      pr "";
+      pr "%s %s(%s)"
+        (c_type_of_repr proto.return)
+        name
+        (c_args_of_proto proto);
+      pr "{";
+      List.iteri proto.params ~f:(fun i p ->
+        pr "  %(%d%d%);"
+          (match p with
+           | Same_as_ocaml_repr         -> "set_intnat(%d, Long_val(x%d))"
+           | Unboxed_float              -> "set_double(%d, x%d)"
+           | Unboxed_integer Pint32     -> "set_int32(%d, x%d)"
+           | Unboxed_integer Pint64     -> "set_int64(%d, x%d)"
+           | Unboxed_integer Pnativeint -> "set_intnat(%d, x%d)"
+           | Untagged_int               -> "set_intnat(%d, x%d)"
+           | Unboxed_vector Pvec128     -> "set_vec128(%d, x%d)")
+          i i);
+      pr "  return %(%d%);"
+        (match proto.return with
+         | Same_as_ocaml_repr         -> "Val_long(get_intnat(%d))"
+         | Unboxed_float              -> "get_double(%d)"
+         | Unboxed_integer Pint32     -> "get_int32(%d)"
+         | Unboxed_integer Pint64     -> "get_int64(%d)"
+         | Unboxed_integer Pnativeint -> "get_intnat(%d)"
+         | Untagged_int               -> "get_intnat(%d)"
+         | Unboxed_vector Pvec128     -> "get_vec128(%d)")
+        (List.length proto.params);
+      pr "}"
+  )
+
+let () =
+  match Sys.argv with
+  | [|_; "ml"|] -> generate_ml ()
+  | [|_; "c" |] -> generate_stubs ()
+  | _ ->
+    prerr_endline "Usage: ocaml gen_test.ml {ml|c}";
+    exit 2
diff --git a/testsuite/tests/unboxed-primitive-args/test.ml b/testsuite/tests/unboxed-primitive-args/test.ml
new file mode 100644
index 00000000000..a7b75037919
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/test.ml
@@ -0,0 +1,24 @@
+(* TEST
+
+readonly_files = "common.mli common.ml test_common.c test_common.h"
+
+* setup-ocamlopt.opt-build-env
+** ocaml
+test_file = "${test_source_directory}/gen_test.ml"
+ocaml_script_as_argument = "true"
+arguments = "c"
+compiler_output = "stubs.c"
+*** ocaml
+arguments = "ml"
+compiler_output = "main.ml"
+**** script
+script = "${cc} -msse4.2 -c test_common.c -I ../../../../../../../../runtime"
+***** script
+script = "${cc} -msse4.2 -c stubs.c -I ../../../../../../../../runtime"
+****** ocamlopt.opt
+ocamlopt_flags = "-fsimd"
+all_modules = "test_common.o stubs.o common.mli common.ml main.ml"
+******* run
+******** check-program-output
+
+*)
diff --git a/testsuite/tests/unboxed-primitive-args/test.reference b/testsuite/tests/unboxed-primitive-args/test.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/testsuite/tests/unboxed-primitive-args/test_common.c b/testsuite/tests/unboxed-primitive-args/test_common.c
new file mode 100644
index 00000000000..9f453c354ce
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/test_common.c
@@ -0,0 +1,54 @@
+/**************************************************************************/
+/*                                                                        */
+/*                                OCaml                                   */
+/*                                                                        */
+/*                  Jeremie Dimino, Jane Street Europe                    */
+/*                                                                        */
+/*   Copyright 2015 Institut National de Recherche en Informatique et     */
+/*     en Automatique.                                                    */
+/*                                                                        */
+/*   All rights reserved.  This file is distributed under the terms of    */
+/*   the GNU Lesser General Public License version 2.1, with the          */
+/*   special exception on linking described in the file LICENSE.          */
+/*                                                                        */
+/**************************************************************************/
+
+#include <caml/mlvalues.h>
+#include <caml/bigarray.h>
+#include <smmintrin.h>
+#include <emmintrin.h>
+
+char *ocaml_buffer;
+char *c_buffer;
+
+value test_set_buffers(value v_ocaml_buffer, value v_c_buffer)
+{
+  ocaml_buffer = Caml_ba_data_val(v_ocaml_buffer);
+  c_buffer = Caml_ba_data_val(v_c_buffer);
+  return Val_unit;
+}
+
+value test_cleanup_normal(void)
+{
+  return Val_int(0);
+}
+
+double test_cleanup_float(void)
+{
+  return 0.;
+}
+
+int64_t vec128_low_int64(__m128i v)
+{
+  return _mm_extract_epi64(v, 0);
+}
+
+int64_t vec128_high_int64(__m128i v)
+{
+  return _mm_extract_epi64(v, 1);
+}
+
+__m128i vec128_of_int64s(int64_t low, int64_t high)
+{
+  return _mm_set_epi64x(high, low);
+}
diff --git a/testsuite/tests/unboxed-primitive-args/test_common.h b/testsuite/tests/unboxed-primitive-args/test_common.h
new file mode 100644
index 00000000000..fb3c08c86be
--- /dev/null
+++ b/testsuite/tests/unboxed-primitive-args/test_common.h
@@ -0,0 +1,50 @@
+/**************************************************************************/
+/*                                                                        */
+/*                                OCaml                                   */
+/*                                                                        */
+/*                  Jeremie Dimino, Jane Street Europe                    */
+/*                                                                        */
+/*   Copyright 2015 Institut National de Recherche en Informatique et     */
+/*     en Automatique.                                                    */
+/*                                                                        */
+/*   All rights reserved.  This file is distributed under the terms of    */
+/*   the GNU Lesser General Public License version 2.1, with the          */
+/*   special exception on linking described in the file LICENSE.          */
+/*                                                                        */
+/**************************************************************************/
+
+#ifndef __TEST_COMMON_H
+#define __TEST_COMMON_H
+
+#include <immintrin.h>
+
+/* Where the OCaml side stores the arguments and result for a test
+   case. The C function will read the result it is supposed to return
+   from this buffer.
+
+   Argument [n] is stored at [n * 8] and the result is stored at
+   [arity * 8].
+*/
+extern char *ocaml_buffer;
+
+/* Where the C function stores the arguments it receive for a test
+   case. The OCaml side will store the result from the C function in
+   this buffer. At the of a test case, both these buffers must be
+   equal. */
+extern char *c_buffer;
+
+#define STRIDE 16
+
+#define get_intnat(n) *(intnat*)(ocaml_buffer+((n)*STRIDE))
+#define get_int32(n) *(int32_t*)(ocaml_buffer+((n)*STRIDE))
+#define get_int64(n) *(int64_t*)(ocaml_buffer+((n)*STRIDE))
+#define get_double(n) *(double*)(ocaml_buffer+((n)*STRIDE))
+#define get_vec128(n) _mm_loadu_si128((__m128i*)(ocaml_buffer+((n)*STRIDE)))
+
+#define set_intnat(n, x) *(intnat*)(c_buffer+((n)*STRIDE)) = (x)
+#define set_int32(n, x) *(int32_t*)(c_buffer+((n)*STRIDE)) = (x)
+#define set_int64(n, x) *(int64_t*)(c_buffer+((n)*STRIDE)) = (x)
+#define set_double(n, x) *(double*)(c_buffer+((n)*STRIDE)) = (x)
+#define set_vec128(n, x) _mm_storeu_si128((__m128i*)(c_buffer+((n)*STRIDE)), (x))
+
+#endif /* __TEST_COMMON_H */