diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 7f02f60..e1c6a82 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -30,12 +30,7 @@
 
 from finn.custom_op.registry import getCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-from finn.util.pyverilator import (
-    pyverilate_get_liveness_threshold_cycles,
-    pyverilate_stitched_ip,
-    reset_rtlsim,
-    toggle_clk,
-)
+from finn.util.pyverilator import pyverilate_stitched_ip, reset_rtlsim, rtlsim_multi_io
 
 try:
     from pyverilator import PyVerilator
@@ -51,7 +46,6 @@ def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
     - pre_hook : hook function to be called before sim start (after reset)
     - post_hook : hook function to be called after sim end
     """
-
     if PyVerilator is None:
         raise ImportError("Installation of PyVerilator is required.")
     # ensure stitched ip project already exists
@@ -64,135 +58,87 @@ def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
     ), """The
     directory from metadata property "vivado_stitch_proj" doesn't exist"""
     trace_file = model.get_metadata_prop("rtlsim_trace")
-    # extract input shape
-    # TODO extend for multiple inputs
-    i_name = model.graph.input[0].name
-    i_tensor = execution_context[i_name]
-    i_dt = model.get_tensor_datatype(i_name)
-    first_node = getCustomOp(model.find_consumer(i_name))
-    i_stream_w = first_node.get_instream_width()
-    # convert input into time multiplexed shape
-    i_folded_shape = first_node.get_folded_input_shape()
-    batchsize = i_tensor.shape[0]
-    # override batch size for input
-    i_folded_shape = list(i_folded_shape)
-    i_folded_shape[0] = batchsize
-    i_folded_shape = tuple(i_folded_shape)
-    # TODO any other layout transformations need to happen here!
-    i_tensor = i_tensor.reshape(i_folded_shape)
-    # extract output shape
-    o_name = model.graph.output[0].name
-    o_shape = model.get_tensor_shape(o_name)
-    o_dt = model.get_tensor_datatype(o_name)
-    last_node = getCustomOp(model.find_producer(o_name))
-    o_folded_shape = last_node.get_folded_output_shape()
-    # override batch size from actual input
-    o_shape = list(o_shape)
-    o_shape[0] = batchsize
-    o_shape = tuple(o_shape)
-    o_folded_shape = list(o_folded_shape)
-    o_folded_shape[0] = batchsize
-    o_folded_shape = tuple(o_folded_shape)
-    o_stream_w = last_node.get_outstream_width()
-    packedBits = o_stream_w
-    targetBits = o_dt.bitwidth()
-    # pack input
-    packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
-    num_out_values = last_node.get_number_output_values()
-    num_out_values *= batchsize
+    if trace_file is None:
+        trace_file = ""
+    extra_verilator_args = model.get_metadata_prop("extra_verilator_args")
+    if extra_verilator_args is None:
+        extra_verilator_args = []
+    else:
+        extra_verilator_args = eval(extra_verilator_args)
+
+    # extract i/o info to prepare io_dict
+    io_dict = {"inputs": {}, "outputs": {}}
+    if_dict = eval(model.get_metadata_prop("vivado_stitch_ifnames"))
+    # go over and prepare inputs
+    for i, i_vi in enumerate(model.graph.input):
+        i_name = i_vi.name
+        i_tensor = execution_context[i_name]
+        i_dt = model.get_tensor_datatype(i_name)
+        first_node = getCustomOp(model.find_consumer(i_name))
+        i_stream_w = first_node.get_instream_width()
+        # convert input into time multiplexed shape
+        i_folded_shape = first_node.get_folded_input_shape()
+        batchsize = i_tensor.shape[0]
+        # override batch size for input
+        i_folded_shape = list(i_folded_shape)
+        i_folded_shape[0] = batchsize
+        i_folded_shape = tuple(i_folded_shape)
+        # TODO any other layout transformations need to happen here!
+        i_tensor = i_tensor.reshape(i_folded_shape)
+        # pack input for rtlsim
+        packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
+        # add to io_dict
+        if_name = if_dict["s_axis"][i][0]
+        io_dict["inputs"][if_name] = packed_input
+    # go over outputs to determine how many values will be produced
+    num_out_values = 0
+    o_tensor_info = []
+    for o, o_vi in enumerate(model.graph.output):
+        # output in io_dict just needs an empty list
+        if_name = if_dict["m_axis"][o][0]
+        io_dict["outputs"][if_name] = []
+        # extract output shape
+        o_name = o_vi.name
+        o_shape = model.get_tensor_shape(o_name)
+        o_dt = model.get_tensor_datatype(o_name)
+        last_node = getCustomOp(model.find_producer(o_name))
+        o_folded_shape = last_node.get_folded_output_shape()
+        # override batch size from actual input
+        o_shape = list(o_shape)
+        o_shape[0] = batchsize
+        o_shape = tuple(o_shape)
+        o_folded_shape = list(o_folded_shape)
+        o_folded_shape[0] = batchsize
+        o_folded_shape = tuple(o_folded_shape)
+        o_stream_w = last_node.get_outstream_width()
+        o_tensor_info.append((o_stream_w, o_dt, o_folded_shape, o_shape))
+        num_out_values += batchsize * last_node.get_number_output_values()
+
     # prepare pyverilator model
     rtlsim_so = model.get_metadata_prop("rtlsim_so")
     if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
-        sim = pyverilate_stitched_ip(model)
+        sim = pyverilate_stitched_ip(model, extra_verilator_args=extra_verilator_args)
         model.set_metadata_prop("rtlsim_so", sim.lib._name)
     else:
         sim = PyVerilator(rtlsim_so, auto_eval=False)
-    ret = _run_rtlsim(
-        sim,
-        packed_input,
-        num_out_values,
-        trace_file,
-        pre_hook=pre_hook,
-        post_hook=post_hook,
-    )
-    packed_output = ret[0]
-    model.set_metadata_prop("cycles_rtlsim", str(ret[1]))
-    # unpack output and put into context
-    o_folded_tensor = rtlsim_output_to_npy(
-        packed_output, None, o_dt, o_folded_shape, packedBits, targetBits
-    )
-    execution_context[o_name] = o_folded_tensor.reshape(o_shape)
-
-
-def _run_rtlsim(
-    sim, inp, num_out_values, trace_file=None, reset=True, pre_hook=None, post_hook=None
-):
-    """Runs the pyverilator simulation by passing the input values to the simulation,
-    toggle the clock and observing the execution time. Argument num_out_values contains
-    the number of expected output values, so the simulation is closed after all
-    outputs are calculated. Function contains also an observation loop that can
-    abort the simulation if no output value is produced after a certain time
-    (liveness_threshold from function pyverilate_get_liveness_threshold_cycles()
-    from finn.util.fpgadataflow)"""
-    inputs = inp
-    outputs = []
-    sim.io.m_axis_0_tready = 1
-
-    # observe if output is completely calculated
-    # observation_count will contain the number of cycles the calculation ran
-    output_observed = False
-    observation_count = 0
-
-    # avoid infinite looping of simulation by aborting when there is no change in
-    # output values after LIVENESS_THRESHOLD cycles
-    no_change_count = 0
-    old_outputs = outputs
-    liveness_threshold = pyverilate_get_liveness_threshold_cycles()
-
-    if trace_file is not None:
-        sim.start_vcd_trace(trace_file)
-    if reset:
-        reset_rtlsim(sim)
 
+    # reset and call rtlsim, including any pre/post hooks
+    reset_rtlsim(sim)
     if pre_hook is not None:
         pre_hook(sim)
-
-    # TODO use utils.fpgadataflow.rtlsim_multi_io instead of manual code below
-    while not (output_observed):
-        sim.io.s_axis_0_tvalid = 1 if len(inputs) > 0 else 0
-        sim.io.s_axis_0_tdata = inputs[0] if len(inputs) > 0 else 0
-        if sim.io.s_axis_0_tready == 1 and sim.io.s_axis_0_tvalid == 1:
-            inputs = inputs[1:]
-        if sim.io.m_axis_0_tvalid == 1 and sim.io.m_axis_0_tready == 1:
-            outputs = outputs + [sim.io.m_axis_0_tdata]
-        toggle_clk(sim)
-
-        observation_count = observation_count + 1
-        no_change_count = no_change_count + 1
-
-        if len(outputs) == num_out_values:
-            cycles_rtlsim = observation_count
-            output_observed = True
-
-        if no_change_count == liveness_threshold:
-            if old_outputs == outputs:
-                if trace_file is not None:
-                    sim.flush_vcd_trace()
-                    sim.stop_vcd_trace()
-                raise Exception(
-                    "Error in simulation! Takes too long to produce output."
-                    "Consider setting the LIVENESS_THRESHOLD env.var. to a "
-                    "larger value."
-                )
-            else:
-                no_change_count = 0
-                old_outputs = outputs
-
+    n_cycles = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file, sname="_")
     if post_hook is not None:
         post_hook(sim)
 
-    if trace_file is not None:
-        sim.flush_vcd_trace()
-        sim.stop_vcd_trace()
-
-    return (outputs, cycles_rtlsim)
+    # unpack outputs and put back into execution context
+    for o, o_vi in enumerate(model.graph.output):
+        o_name = o_vi.name
+        if_name = if_dict["m_axis"][o][0]
+        o_stream_w, o_dt, o_folded_shape, o_shape = o_tensor_info[o]
+        packed_output = io_dict["outputs"][if_name]
+        o_folded_tensor = rtlsim_output_to_npy(
+            packed_output, None, o_dt, o_folded_shape, o_stream_w, o_dt.bitwidth()
+        )
+        execution_context[o_name] = o_folded_tensor.reshape(o_shape)
+
+    model.set_metadata_prop("cycles_rtlsim", str(n_cycles))
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index 5cf5461..503569f 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -121,24 +121,30 @@ def throughput_test_rtlsim(model, batchsize=100):
     ), """Top-level exec_mode
     metadata_prop must be set to rtlsim"""
 
-    # create random input
-    iname = model.graph.input[0].name
-    ishape = model.get_tensor_shape(iname)
-    ishape_batch = ishape
-    ishape_batch[0] = batchsize
-    idt = model.get_tensor_datatype(iname)
-    dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
-    # compute input/output sizes
-    oname = model.graph.output[0].name
-    oshape = model.get_tensor_shape(oname)
-    oshape_batch = oshape
-    oshape_batch[0] = batchsize
-    odt = model.get_tensor_datatype(oname)
-    i_bytes = (np.prod(ishape_batch) * idt.bitwidth()) / 8
-    o_bytes = (np.prod(oshape_batch) * odt.bitwidth()) / 8
-    # make empty exec context and insert input
+    # make empty exec context and insert random inputs
     ctx = model.make_empty_exec_context()
-    ctx[iname] = dummy_input
+    i_bytes = 0
+    for i_vi in model.graph.input:
+        # create random input
+        iname = i_vi.name
+        ishape = model.get_tensor_shape(iname)
+        ishape_batch = ishape
+        ishape_batch[0] = batchsize
+        idt = model.get_tensor_datatype(iname)
+        dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
+        ctx[iname] = dummy_input
+        i_bytes += (np.prod(ishape_batch) * idt.bitwidth()) / 8
+
+    # compute total output size as well
+    o_bytes = 0
+    for o_vi in model.graph.output:
+        oname = o_vi.name
+        oshape = model.get_tensor_shape(oname)
+        oshape_batch = oshape
+        oshape_batch[0] = batchsize
+        odt = model.get_tensor_datatype(oname)
+        o_bytes += (np.prod(oshape_batch) * odt.bitwidth()) / 8
+
     # remove liveness threshold, launch rtlsim
     os.environ["LIVENESS_THRESHOLD"] = "-1"
     rtlsim_exec(model, ctx)
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 500cae8..13c51dd 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -281,11 +281,14 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru
         inp = np.load(input_file)
     else:
         raise Exception("input_file must be ndarray or filename for .npy")
-    packed_data = pack_innermost_dim_as_hex_string(
-        inp, input_dtype, pad_to_nbits, reverse_inner=reverse_inner
-    )
-    packed_data = packed_data.flatten()
-    packed_data = [int(x[2:], 16) for x in packed_data]
+    if inp.shape[-1] == 1 and input_dtype.is_integer():
+        packed_data = inp.flatten().astype(input_dtype.to_numpy_dt())
+    else:
+        packed_data = pack_innermost_dim_as_hex_string(
+            inp, input_dtype, pad_to_nbits, reverse_inner=reverse_inner
+        )
+        packed_data = packed_data.flatten()
+        packed_data = [int(x[2:], 16) for x in packed_data]
     return packed_data
 
 
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index 78e6706..1ec833b 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -107,6 +107,7 @@ def rtlsim_multi_io(sim, io_dict, num_out_values, trace_file="", sname="_V_V_"):
                 and _read_signal(sim, outp + sname + "TVALID") == 1
             ):
                 outputs = outputs + [_read_signal(sim, outp + sname + "TDATA")]
+                output_count += 1
             io_dict["outputs"][outp] = outputs
 
         toggle_clk(sim)
@@ -142,7 +143,10 @@ def rtlsim_multi_io(sim, io_dict, num_out_values, trace_file="", sname="_V_V_"):
 
 
 def pyverilate_stitched_ip(
-    model, read_internal_signals=True, disable_common_warnings=True
+    model,
+    read_internal_signals=True,
+    disable_common_warnings=True,
+    extra_verilator_args=[],
 ):
     """Given a model with stitched IP, return a PyVerilator sim object.
     Trace depth is also controllable, see get_rtlsim_trace_depth()
@@ -221,7 +225,7 @@ def file_to_basename(x):
         top_module_name=top_module_name,
         auto_eval=False,
         read_internal_signals=read_internal_signals,
-        extra_args=verilator_args,
+        extra_args=verilator_args + extra_verilator_args,
     )
     return sim