// cudadecoder/cuda-decoder-kernels.cu
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun, Justin Luitjens, Ryan Leary
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cub/cub.cuh>
#include "cuda-decoder-kernels.h"
#include "cuda-decoder-kernels-utils.h"
namespace kaldi {
namespace cuda_decoder {
// Initialize the hashmap with NO_VAL
// Called in InitDeviceData, when building the CudaDecoder object
__global__ void init_hashmap_kernel(DeviceParams cst_dev_params) {
const int max_nlanes = cst_dev_params.max_nlanes;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, max_nlanes) {
const int capacity = cst_dev_params.hashmap_capacity;
KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, capacity) {
cst_dev_params.d_hashmap_values.lane(ilane)[idx] =
KALDI_CUDA_DECODER_HASHMAP_NO_VAL;
}
}
}
// Initializes the initial channel on the device
// Called by ComputeInitialChannel
// It is NOT called in InitDecoding:
// in InitDecoding we clone this initial channel into the channel we called
// InitDecoding on.
// Here we actually create that initial channel;
// we do that once, in the CudaDecoder constructor.
//
// The initial channel is the state of a channel right before
// it starts decoding a new utterance
// Threads (1, 1, 1)
// Blocks (1, 1, 1)
__global__ void initialize_initial_lane_kernel(DeviceParams cst_dev_params) {
const int init_ichannel = cst_dev_params.init_channel_id;
const int init_ilane = 0;
ChannelCounters *init_channel_counters =
cst_dev_params.d_channels_counters.channel(init_ichannel);
LaneCounters *lane_counters =
cst_dev_params.d_lanes_counters.lane(init_ilane);
  // Making the data look like an ExpandArcsEmitting just executed,
  // and putting the StartState in the aux_q. We will then pick up a normal
  // execution from there
  // (calling PruneAndPreprocess, then ExpandArcsNonEmitting, ...)
lane_counters->aux_q_end = 0;
lane_counters->aux_q_requested = 0;
lane_counters->post_expand_aux_q_end = 1;
lane_counters->main_q_global_offset = 0;
lane_counters->main_q_local_offset = 0;
lane_counters->main_q_n_extra_prev_tokens = 0;
lane_counters->int_cutoff = INT_MAX;
lane_counters->main_q_n_emitting_tokens = 0; // all non emitting
lane_counters->int_beam = floatToOrderedInt(cst_dev_params.default_beam);
lane_counters->main_q_narcs_and_end = {0, 0};
lane_counters->main_q_requested = 0;
lane_counters->prev_arg_min_int_cost = 0;
const StateId init_state = cst_dev_params.init_state;
const CostType init_cost = cst_dev_params.init_cost;
IntegerCostType int_init_cost = floatToOrderedInt(init_cost);
cst_dev_params.d_aux_q_state_and_cost.lane(init_ilane)[0] = {init_state,
int_init_cost};
lane_counters->min_int_cost = int_init_cost;
CostType cutoff = orderedIntToFloat(int_init_cost);
lane_counters->int_cutoff =
floatToOrderedInt(cutoff + cst_dev_params.default_beam);
cst_dev_params.d_aux_q_info.lane(init_ilane)[0] = {INT_MIN, -1};
}
// Called by InitDecoding
// Called when some channels are about to start decoding a new utterance.
// Does everything needed on the device to start decoding a new utterance
// with those channels:
// it clones the initial channel (created in initialize_initial_lane_kernel)
// into the channels we want to call InitDecoding on
__global__ void init_decoding_on_device_kernel(DeviceParams cst_dev_params,
KernelParams params) {
const int init_ichannel = cst_dev_params.init_channel_id;
const ChannelCounters *init_channel_counters =
cst_dev_params.d_channels_counters.channel(init_ichannel);
const int32 init_main_q_end =
init_channel_counters->prev_main_q_narcs_and_end.y;
const int32 nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, init_main_q_end) {
const LaneCounters *lane_counters =
cst_dev_params.d_lanes_counters.lane(ilane);
const int32 ichannel = lane_counters->channel_to_compute;
cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[idx] =
cst_dev_params.d_main_q_state_and_cost.channel(init_ichannel)[idx];
cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel)[idx] =
cst_dev_params.d_main_q_degrees_prefix_sum.channel(
init_ichannel)[idx];
cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[idx] =
cst_dev_params.d_main_q_arc_offsets.channel(init_ichannel)[idx];
if (idx == 0) {
ChannelCounters *channel_counters =
cst_dev_params.d_channels_counters.channel(ichannel);
channel_counters->prev_main_q_narcs_and_end =
init_channel_counters->prev_main_q_narcs_and_end;
channel_counters->prev_main_q_n_extra_prev_tokens =
init_channel_counters->prev_main_q_n_extra_prev_tokens;
channel_counters->prev_main_q_global_offset = 0;
channel_counters->prev_main_q_extra_prev_tokens_global_offset = 0;
channel_counters->prev_beam = cst_dev_params.default_beam;
}
}
}
}
// Context switch : load
// Called by LoadChannelsStateToLanes
// THREADS : (1, 1, 1)
// BLOCKS : (1, nlanes_used, 1)
__global__ void load_channels_state_in_lanes_kernel(DeviceParams cst_dev_params,
KernelParams params) {
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
const int32 ichannel = lane_counters->channel_to_compute;
const ChannelCounters *channel_counters =
cst_dev_params.d_channels_counters.channel(ichannel);
int2 main_q_narcs_and_end = channel_counters->prev_main_q_narcs_and_end;
lane_counters->main_q_narcs_and_end = main_q_narcs_and_end;
lane_counters->main_q_n_extra_prev_tokens =
channel_counters->prev_main_q_n_extra_prev_tokens;
CostType beam = channel_counters->prev_beam;
IntegerCostType int_beam = floatToOrderedInt(beam);
lane_counters->int_beam = int_beam;
lane_counters->adaptive_int_beam_with_validity_index.x = int_beam;
lane_counters->adaptive_int_beam_with_validity_index.y =
cst_dev_params.adaptive_beam_static_segment;
lane_counters->main_q_global_offset =
channel_counters
->prev_main_q_global_offset; // we'll update it after emitting
lane_counters->main_q_extra_prev_tokens_global_offset =
channel_counters->prev_main_q_extra_prev_tokens_global_offset;
}
}
// Context switch : store
// Called by SaveChannelsStateFromLanes
// THREADS : (1, 1, 1)
// BLOCKS : (1, nchannel_to_compute, 1)
__global__ void save_channels_state_from_lanes_kernel(
DeviceParams cst_dev_params, KernelParams params) {
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
const LaneCounters *lane_counters =
cst_dev_params.d_lanes_counters.lane(ilane);
const int32 ichannel = lane_counters->channel_to_compute;
ChannelCounters *channel_counters =
cst_dev_params.d_channels_counters.channel(ichannel);
channel_counters->prev_main_q_global_offset =
lane_counters->main_q_global_offset;
channel_counters->prev_main_q_extra_prev_tokens_global_offset =
lane_counters->main_q_extra_prev_tokens_global_offset;
channel_counters->prev_main_q_narcs_and_end =
lane_counters->main_q_narcs_and_end;
channel_counters->prev_main_q_n_extra_prev_tokens =
lane_counters->main_q_n_extra_prev_tokens;
channel_counters->prev_beam = orderedIntToFloat(lane_counters->int_beam);
}
}
// compute_lane_offsets_kernel
// The kernel concatenate_lanes_data concatenates multiple arrays into a
// single contiguous array.
// compute_lane_offsets_kernel computes the offset of each per-lane array
// inside that contiguous array.
// This kernel is 1D : the lanes are on the X dimension, because we want to
// compute the offsets of those lanes
__global__ void compute_lane_offsets_kernel(DeviceParams cst_dev_params,
KernelParams params) {
typedef cub::BlockScan<int4, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
__shared__ typename BlockScan::TempStorage temp_storage;
const int nlanes = params.nlanes_used;
int4 sum_so_far = {0, 0, 0, 0};
KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(
block_offset, thread_idx,
nlanes + 1) { // +1 because we are doing an exclusive sum, and we want
// all the values
int32 ilane = block_offset + thread_idx;
int4 zero4 = {0, 0, 0, 0};
int4 lane_offsets = zero4;
if (ilane < nlanes) { // nlanes, not nlanes+1, because we cannot read +1
// values (undefined)
LaneCounters *d_lane_counters =
cst_dev_params.d_lanes_counters.lane(ilane);
int32 main_q_end = d_lane_counters->main_q_narcs_and_end.y;
int32 n_emitting_tokens = d_lane_counters->main_q_n_emitting_tokens;
int32 main_q_n_extra_prev_tokens =
d_lane_counters->main_q_n_extra_prev_tokens;
lane_offsets = {main_q_end, n_emitting_tokens, main_q_n_extra_prev_tokens,
0};
}
int4 block_aggregate;
BlockScan(temp_storage)
.ExclusiveScan(lane_offsets, lane_offsets, zero4, PlusPlusPlusPlus(),
block_aggregate);
PlusPlusPlusPlus pppp;
lane_offsets = pppp(lane_offsets, sum_so_far);
sum_so_far = pppp(sum_so_far, block_aggregate);
if (ilane < (nlanes + 1)) { // nlanes+1, to write the output
LaneCounters *d_lane_counters =
cst_dev_params.d_lanes_counters.lane(ilane);
LaneCounters *h_lane_counters =
cst_dev_params.h_lanes_counters.lane(ilane);
h_lane_counters->main_q_end_lane_offset =
d_lane_counters->main_q_end_lane_offset = lane_offsets.x;
h_lane_counters->main_q_n_emitting_tokens_lane_offset =
d_lane_counters->main_q_n_emitting_tokens_lane_offset =
lane_offsets.y;
h_lane_counters->main_q_n_extra_prev_tokens_lane_offset =
d_lane_counters->main_q_n_extra_prev_tokens_lane_offset =
lane_offsets.z;
}
__syncthreads(); // reusing temp_storage
}
}
// concatenate_lanes_data
// Called by PerformConcatenatedCopy
// Builds a concatenated array in concat,
// by concatenating all the arrays src.lane(ilane)
// for ilane=0..params.nlanes_used
// Used to prepare data for the copy to host. We want to avoid small
// Device2Host copies.
template <typename T>
__global__ void concatenate_lanes_data_kernel(DeviceParams cst_dev_params,
KernelParams params,
LaneMatrixView<T> src, T *concat,
int32 *lane_offsets) {
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
const int32 stride =
sizeof(LaneCounters) / sizeof(int32); // offsets are in LaneCounters
int32 beg = *(lane_offsets + ilane * stride);
int32 end = *(lane_offsets + (ilane + 1) * stride);
int32 vec_size = end - beg;
KALDI_CUDA_DECODER_1D_KERNEL_LOOP(idx, vec_size) {
T d = src.lane(ilane)[idx];
concat[beg + idx] = d;
}
}
}
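// Illustrative sketch (not part of the decoder): why we concatenate before
// copying. Once compute_lane_offsets_kernel and the kernel above have built a
// single contiguous buffer, the host only needs one large Device2Host copy
// instead of nlanes small ones. toy_copy_concatenated_to_host and its
// parameters are hypothetical names used only for this example.
template <typename T>
void toy_copy_concatenated_to_host(const T *d_concat, T *h_concat,
                                   int32 total_elements, cudaStream_t st) {
  // One large asynchronous copy of the already-concatenated buffer...
  cudaMemcpyAsync(h_concat, d_concat, total_elements * sizeof(T),
                  cudaMemcpyDeviceToHost, st);
  // ...instead of one small cudaMemcpyAsync per lane, which would pay the
  // per-copy launch latency nlanes times.
}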
// nonemitting_preprocess_and_contract_kernel
// Called from PruneAndPreprocess
// This kernel prunes the aux_q, moves the surviving tokens to the main_q,
// and adds the preprocessing information necessary for the next ExpandArcs
// (the expand that follows PruneAndPreprocess is always non-emitting)
// It prunes the tokens using the cutoff, and prepares the data necessary for
// ExpandArcs:
// d_main_q_degrees_prefix_sum, d_main_q_arc_offsets_
// The prefix sum is done in one pass here, using a trick (we compute the
// prefix sum as we fill the main_q).
// An illustrative sketch of that trick follows this kernel.
__global__ void nonemitting_preprocess_and_contract_kernel(
DeviceParams cst_dev_params, KernelParams params) {
typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
__shared__ typename BlockScan::TempStorage sh_temp_storage;
  // We need to move the surviving tokens to the main_q
//
// sh_main_q_global_block_offset has two purposes :
// (1) to know where to store the survival tokens in the main_q
// (2) to perform the prefix sum degrees (of the survival tokens)
__shared__ int2 sh_main_q_global_block_offset;
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
const int32 aux_q_end = lane_counters->post_expand_aux_q_end;
const IntegerCostType int_cutoff = lane_counters->int_cutoff;
// Keeping whole CTA alive. We'll use __syncthreads()
KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx,
aux_q_end) {
const int32 aux_q_idx = block_offset + thread_idx;
const int32 ichannel = lane_counters->channel_to_compute;
int32 degree = 0;
int32 arc_start = -1;
StateId token_state;
IntegerCostType token_int_cost;
      // We've kept the whole CTA alive. Now we keep only the threads with a
      // valid token
if (aux_q_idx < aux_q_end) {
const int2 both =
cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_idx];
token_state = both.x;
token_int_cost = both.y;
if (token_int_cost < int_cutoff) {
// We'll keep that token. Loading its arc degree/csr offset now.
arc_start = cst_dev_params.d_arc_ne_offsets[token_state];
const int32 arc_end =
cst_dev_params.d_arc_ne_offsets[token_state + 1];
degree = arc_end - arc_start;
}
}
// If we've set a different arc_start,
// this thread has a valid unpruned token
int32 is_pruned = (arc_start == -1);
// We now know which tokens will be moved to the main_q, the remaining
// will be pruned
// we now compute a prefix sum inside the CUDA block to determine the
// local indexes of the unpruned tokens
      // the first unpruned token will have an index of 0, the second 1, ...
// We also need to compute the prefix sum of the arc degrees
// we start by doing a local prefix sum inside the CUDA block
int2 block_prefix_sum_narcs_and_end = {degree, (is_pruned ? 0 : 1)};
const int2 zero2 = {0, 0};
// Computing the prefix sum (exclusive)
BlockScan(sh_temp_storage)
.ExclusiveScan(block_prefix_sum_narcs_and_end,
block_prefix_sum_narcs_and_end, zero2, PlusPlus());
if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) {
// This conditional branch is entered by the last thread
// Because it is the last, the prefix_sum of that thread contains the
// sum of all elements
// We also add the value from this thread - the prefix sum is exclusive
// For the sum, we want it inclusive
int2 block_sum = block_prefix_sum_narcs_and_end;
block_sum.x += degree;
block_sum.y += is_pruned ? 0 : 1;
        // Doing two things at the same time :
        // requesting a spot in the main_q to store the surviving tokens from
        // this CTA, and incrementing the narcs value. atomic64.x will contain
        // the number of arcs in the main_q up until the atomic64.y index.
        // That's all we need to finish our prefix sum: we add this global
        // offset.
        // First atomic to check that we are not overflowing the main_q.
int block_offset =
atomicAdd(&lane_counters->main_q_requested, block_sum.y);
// Verify that we do not overflow
if (block_offset + block_sum.y < cst_dev_params.main_q_capacity) {
          // we don't overflow; we can safely grab a spot in the main_q
sh_main_q_global_block_offset =
atomicAddI2(&lane_counters->main_q_narcs_and_end, block_sum);
} else {
// our update would overflow
lane_counters->q_overflow |= OVERFLOW_MAIN_Q; // for the host
sh_main_q_global_block_offset.y =
cst_dev_params.main_q_capacity; // used as flag to broadcast the
// information in the CTA
}
}
// Syncing because :
// - Broadcasting sh_main_q_global_block_offset
// - We may reuse sh_temp_storage (cf CUB doc)
__syncthreads();
// Checking if we are overflowing the main_q
// All threads are executing the next line
if (sh_main_q_global_block_offset.y == cst_dev_params.main_q_capacity)
goto end_lane; // done for this lane
// If we are executing the following lines it means that we are not
// overflowing the queue
// We then continue what we were doing
if (!is_pruned) {
bool moving_emitting_tokens = (lane_counters->main_q_local_offset == 0);
// we will move our unpruned token to the main_q, at index main_q_idx
InfoToken tok_info = cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_idx];
const int32 main_q_idx =
sh_main_q_global_block_offset.y + block_prefix_sum_narcs_and_end.y;
CostType acoustic_cost = 0.0f;
if (moving_emitting_tokens && tok_info.arc_idx != -1) {
const int32 arc_ilabel =
cst_dev_params.d_arc_pdf_ilabels[tok_info.arc_idx];
acoustic_cost = -lane_counters->loglikelihoods[arc_ilabel];
}
cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx] = tok_info;
// Moving the token to the main q
cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx] = {
token_state, token_int_cost};
cst_dev_params.d_main_q_acoustic_cost.lane(ilane)[main_q_idx] =
acoustic_cost;
// Saving the global prefix sum
const int32 prefix_sum_narcs =
sh_main_q_global_block_offset.x + block_prefix_sum_narcs_and_end.x;
cst_dev_params.d_main_q_degrees_prefix_sum.channel(
ichannel)[main_q_idx] = prefix_sum_narcs;
        // Saving the CSR arc offset for that token's state
        // it will be used by the expand kernel, and avoids doing a new random
        // memory access in the expand kernel
cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx] =
arc_start;
}
}
end_lane:; // empty statement
}
}
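// Illustrative sketch (not used by the decoder): the "contract + prefix sum
// in one pass" trick mentioned above, reduced to its core. Each CTA scans its
// own (degree, kept) pairs with CUB, then a single atomicAddI2 (the same
// helper used by the kernel above) reserves both the output slots and the
// global arc count, which completes the global prefix sum. The names
// ToyInt2Sum, toy_contract_with_prefix_sum_kernel, d_degrees and
// d_narcs_and_end are hypothetical and exist only for this example.
struct ToyInt2Sum {
  __device__ int2 operator()(const int2 &a, const int2 &b) const {
    return make_int2(a.x + b.x, a.y + b.y);
  }
};
template <int BLOCK_SIZE>
__global__ void toy_contract_with_prefix_sum_kernel(
    const int32 *d_degrees, int32 n, int32 *d_degrees_prefix_sum_out,
    int2 *d_narcs_and_end /* single int2, initialized to {0, 0} */) {
  typedef cub::BlockScan<int2, BLOCK_SIZE> BlockScan;
  __shared__ typename BlockScan::TempStorage sh_temp_storage;
  __shared__ int2 sh_block_offset;
  const int32 idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  const int32 degree = (idx < n) ? d_degrees[idx] : 0;
  const int32 kept = (degree > 0) ? 1 : 0;  // stands in for "survives pruning"
  int2 local_prefix = {degree, kept};
  int2 block_sum;
  const int2 zero2 = {0, 0};
  // Exclusive scan inside the CTA: local_prefix.x = arcs before this thread,
  // local_prefix.y = kept tokens before this thread. block_sum is the CTA
  // total.
  BlockScan(sh_temp_storage)
      .ExclusiveScan(local_prefix, local_prefix, zero2, ToyInt2Sum(),
                     block_sum);
  if (threadIdx.x == 0) {
    // One atomic reserves both the output slots (.y) and the "arcs before
    // this CTA" count (.x), keeping the two offsets consistent.
    sh_block_offset = atomicAddI2(d_narcs_and_end, block_sum);
  }
  __syncthreads();  // broadcast sh_block_offset
  if (kept) {
    const int32 out_idx = sh_block_offset.y + local_prefix.y;
    // Global exclusive prefix sum of degrees, obtained in the same pass as
    // the compaction
    d_degrees_prefix_sum_out[out_idx] = sh_block_offset.x + local_prefix.x;
  }
}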
// UpdateAdaptiveBeam is used in ExpandArcs
// When we generate new tokens by traversing arcs,
// we can end up creating a lot of tokens, for instance if the loglikelihoods
// generated for the current frame are too uniform (we don't have
// any good tokens that would reduce the cutoff, so we end up generating
// a lot of tokens)
// To avoid overflowing the aux_q, we apply a decreasing beam.
// With aux_q_end being the current aux_q size, we have a decreasing function f, with
// adaptive_beam = f(aux_q_end)
// f is a decreasing piecewise constant function
// Please note that when processing tokens, we usually have tens of thousands of threads
// generating tokens. Those are already in flight, and will not reload the beam immediately.
// It means that we need to start reducing the beam as soon as we detect that we are
// generating more tokens than expected.
// We can configure the function f using KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
// and KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS.
// We will use default_beam for the first max_tokens_per_frame/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
// tokens in the aux_q.
// Once we reach that number, we decrease the adaptive beam step by step from default_beam towards 0,
// using KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS steps
//
// x-axis : aux_q_end. How many tokens are already in the aux_q
// y-axis : adaptive_beam = f(aux_q_end)
//
// default_beam _| ________________
//               |                 |________
//               |                          |________
//            0 _|                                   |________
//               |_________________________________________________
//               |                 |                               |
//     aux_q_end= 0          static_segment            max_tokens_per_frame
// We have :
// static_segment = max_tokens_per_frame/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
// and KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS = 3
__device__ void UpdateAdaptiveBeam(const DeviceParams &cst_dev_params,
const int aux_q_index_block_offset,
IntegerCostType min_int_cost,
int2 *adaptive_int_beam_with_validity_index,
LaneCounters *lane_counters) {
int32 beam_valid_until_idx = adaptive_int_beam_with_validity_index->y;
if (aux_q_index_block_offset < beam_valid_until_idx) return; // nothing to do
CostType beam = orderedIntToFloat(adaptive_int_beam_with_validity_index->x);
while (aux_q_index_block_offset >= beam_valid_until_idx) {
beam /= 2;
beam_valid_until_idx += cst_dev_params.adaptive_beam_bin_width;
}
IntegerCostType new_int_cutoff = (min_int_cost < INT_MAX)
? floatToOrderedInt(orderedIntToFloat(min_int_cost) + beam)
: INT_MAX;
IntegerCostType int_beam = floatToOrderedInt(beam);
adaptive_int_beam_with_validity_index->x = int_beam;
adaptive_int_beam_with_validity_index->y = beam_valid_until_idx;
  // We can have races between the two atomics.
  // However, the worst that can happen is that a CTA delays updating the
  // beam. This is not a critical bug. However, once we have a
  // floatToOrderedInt that generates unsigned ints, we could merge the two
  // atomics into a single atomic64.
atomicMin(&lane_counters->adaptive_int_beam_with_validity_index.x, int_beam);
atomicMax(&lane_counters->adaptive_int_beam_with_validity_index.y,
beam_valid_until_idx);
atomicMin(&lane_counters->int_cutoff, new_int_cutoff);
}
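// Illustrative sketch (not called by the decoder): a standalone version of
// the decay f(aux_q_end) described above, written as a pure function only to
// make its shape explicit. It assumes the same static segment / bin width
// parameters that UpdateAdaptiveBeam reads from cst_dev_params, and, like
// UpdateAdaptiveBeam, halves the beam for each bin past the static segment.
// The name toy_adaptive_beam is hypothetical.
__device__ __forceinline__ CostType toy_adaptive_beam(CostType default_beam,
                                                      int32 aux_q_end,
                                                      int32 static_segment,
                                                      int32 bin_width) {
  CostType beam = default_beam;
  // Flat segment: keep default_beam for the first static_segment tokens,
  // then apply one step of the piecewise-constant decay per bin
  for (int32 valid_until = static_segment; aux_q_end >= valid_until;
       valid_until += bin_width)
    beam /= 2;
  return beam;
}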
// One CTA / lane
__global__ void reset_for_frame_and_estimate_cutoff_kernel(
DeviceParams cst_dev_params, KernelParams params) {
typedef cub::BlockReduce<CostType, KALDI_CUDA_DECODER_1D_BLOCK> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
const int32 ichannel = lane_counters->channel_to_compute;
ChannelCounters *channel_counters =
cst_dev_params.d_channels_counters.channel(ichannel);
if (threadIdx.x == 0) {
const CostType current_beam = orderedIntToFloat(lane_counters->int_beam);
// Do some initialization
lane_counters->q_overflow = OVERFLOW_NONE;
lane_counters->main_q_n_emitting_tokens = INT_MAX;
lane_counters->int_cutoff = INT_MAX;
lane_counters->min_int_cost = INT_MAX;
lane_counters->q_overflow = OVERFLOW_NONE;
lane_counters->int_relative_cost = INT_MAX;
lane_counters->aux_q_requested = 0;
lane_counters->main_q_requested = 0;
lane_counters->main_q_local_offset = 0;
lane_counters->compute_max_active =
false; // will be set to true if necessary
channel_counters->min_int_cost_and_arg_with_final.x =
INT_MAX; // it will be set with atomicMins
const CostType new_beam =
fmin(cst_dev_params.default_beam,
current_beam * KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE);
lane_counters->int_beam = floatToOrderedInt(new_beam);
}
const int32 prev_arg_min = lane_counters->prev_arg_min_int_cost;
int2 both =
cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[prev_arg_min];
int32 int_cost = both.y;
CostType previous_cost = orderedIntToFloat(int_cost);
const int32 prev_arg_min_state = both.x;
int32 arc_start = cst_dev_params.d_arc_e_offsets[prev_arg_min_state];
int32 arc_end = cst_dev_params.d_arc_e_offsets[prev_arg_min_state + 1];
int32 narcs = arc_end - arc_start;
// no loop - we only process the first KALDI_CUDA_DECODER_1D_BLOCK arcs
// we just want an estimate
CostType total_cost = FLT_MAX;
if (threadIdx.x < narcs) {
int32 iarc = arc_start + threadIdx.x;
CostType arc_fixed_cost = cst_dev_params.d_arc_weights[iarc];
const int32 arc_ilabel = cst_dev_params.d_arc_pdf_ilabels[iarc];
CostType acoustic_cost = -lane_counters->loglikelihoods[arc_ilabel];
total_cost = previous_cost + arc_fixed_cost +
acoustic_cost; // +0.0f, best prev cost is normalized to 0
}
KALDI_CUDA_DECODER_1D_KERNEL_LOOP(bin_id, KALDI_CUDA_DECODER_HISTO_NBINS) {
cst_dev_params.d_histograms.lane(ilane)[bin_id] = 0; // reset for this frame
}
CostType min = BlockReduce(temp_storage).Reduce(total_cost, cub::Min());
if (narcs > 0 && threadIdx.x == 0) {
// narcs > 0 to have at least one valid element in the reduce
CostType new_cutoff = min + orderedIntToFloat(lane_counters->int_beam);
IntegerCostType new_int_cutoff = floatToOrderedInt(new_cutoff);
lane_counters->int_cutoff = new_int_cutoff;
lane_counters->min_int_cost = floatToOrderedInt(min);
}
}
}
// ExpandArc kernel
// This kernel does the actual work of traversing arcs
//
// Pseudo code :
// for all tokens tok in main_q[main_q_offset...end]:
//     u = tok.next_state
//     for all arcs a(u->v) leaving u in the FST:
//         v_cost = tok.cost + a.cost + acoustic_cost
//
//         if v_cost < cutoff and v_cost < best_state_cost[v]:
//             generate the token associated to v, add it to the aux_q
//             if necessary update the cutoff
//             if the aux_q is getting full, reduce the beam
// For more information please refer to http://kaldi-asr.org/doc/decoders.html
//
// ExpandArc relies on some preprocessed data to be able to function :
// for instance, it needs the prefix sum of the arc degrees of all token.state
// in the main_q
// We need to call a Preprocess kernel before ExpandArc
//
// ExpandArc is used for both emitting and nonemitting phases
// Differences between emitting and nonemitting :
// 1) params.d_q_arc_offset contains offsets to either emitting or
// nonemitting arcs.
// It is transparent for this kernel. The differentiation was done in
// the Preprocess kernel,
// which is responsible for filling the params.d_q_arc_offset array
// 2) Computation of the acoustic cost. If nonemitting, it is equal to 0.
// If emitting, we need
// to use values from the acoustic model (through the d_loglikelihoods
// array)
//
// Note : ExpandArc is not the only kernel able to traverse arcs.
// FinalizeProcessNonemitting contains a simplified version of expand for only
// one CUDA block
template <bool IS_EMITTING>
__global__ void expand_arcs_kernel(DeviceParams cst_dev_params,
KernelParams params) {
// BlockScan that we will use to compute token indexes in the output queue,
// and to find the min cost in the block
typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_1D_BLOCK> BlockScan;
__shared__ typename BlockScan::TempStorage sh_temp_storage_scan;
// This kernel writes the new token to the output queue aux_q
// We will request a spot to store all the new tokens created by threads in
// this CUDA block
// sh_aux_q_index_block_offset indicates where to store them in the aux_q
  // tokens created in this CUDA block will be stored in :
  // aux_q[sh_aux_q_index_block_offset], aux_q[sh_aux_q_index_block_offset + 1], ...
__shared__ int32 sh_aux_q_index_block_offset;
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
const int32 main_q_offset = lane_counters->main_q_local_offset;
const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
const int32 total_narcs = lane_counters->main_q_narcs_and_end.x;
KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(block_offset, thread_idx,
total_narcs) {
int2 adaptive_int_beam_with_validity_index =
lane_counters->adaptive_int_beam_with_validity_index;
const int32 ichannel = lane_counters->channel_to_compute;
      // Important : this thread is not responsible for a token in the input
      // queue main_q,
      // but for an arc going out of a token in the main_q.
      // The main_q contains total_narcs arcs in total,
      // and this thread will compute the main_q_arc_index-th arc of the main_q
      // For instance, the first thread in the grid, with threadIdx.x == 0 and
      // blockIdx.x == 0,
      // will process the first arc of the token in main_q[main_q_offset + 0]
      // (if that token has at least one arc)
      //
      // This ensures a perfect one thread = one arc load balancing,
      // but we have work to do to know exactly which arc is the
      // main_q_arc_index-th arc
      // (what's its source ? its destination ? its arc_idx in the FST CSR ?)
int32 main_q_arc_index = block_offset + thread_idx;
// We'll need those variables later in the kernel
// we declare them outside of the "valid_input" scope
// to be able to access them later
int32 main_q_idx;
int32 arc_idx;
StateId arc_next_state;
IntegerCostType int_total_cost = INT_MAX;
if (main_q_arc_index < total_narcs) {
        // Current thread must take care of the main_q_arc_index-th arc
        // we need to know what the source of that arc is
        // ie which token.state in main_q does it start from ?
        // We use a binary search in the prefix sum of the token degrees to
        // get that information
//
// Example : main_q contains 3 tokens
        // - First token is associated to a state which has 3 outgoing arcs
        // - Second token is associated to a state which has 0 outgoing arcs
        // - Third token is associated to a state which has 2 outgoing arcs
//
// We store the degrees in an array :
// [3, 0, 2]
//
// We then compute the exclusive prefix sum of that array :
// [0, 3, 3, 5]
//
// In total, we have 5 arcs in the main_q. ExpandArc will use 5 threads.
//
// Let's say we are the fifth thread in ExpandArc.
// we have threadIdx.x == 4, and blockIdx.x == 0
// it gives us main_q_arc_index == 4
// From there we have no idea what we're supposed to do next, we need to
// have information about the
// arc that we're supposed to traverse
//
        // To do that, we look for the maximum index maxle_i in the prefix sum
        // array such that prefix_sum[maxle_i] <= 4
        //
        // [0, 3, 3, 5]
        //        ^
        //        here
        // maxle_i = 2
        // it means that our source token is at index 2 in the main_q
        // and we are computing the arc at index (main_q_arc_index -
        // prefix_sum[maxle_i]) of that token
        // ie the arc at index (4-3) = 1, the second arc of the third token
        // in main_q
// Searching for the source of the arc that we will process
// (main_q_arc_index)
// we could preprocess the search in the preprocess kernels - for now
// this kernel is fast enough
const int32 *degrees_prefix_sum =
cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel);
main_q_idx = binsearch_maxle(degrees_prefix_sum, main_q_arc_index,
main_q_offset, main_q_end - 1);
// state_first_arc_idx_in_main_q
// d_main_q_degrees_prefix_sum contains the prefix sum of the
// degrees of all tokens in the main_q
        // d_main_q_degrees_prefix_sum[main_q_idx] contains the number of arcs
        // in the main_q until that token
const int32 state_first_arc_idx_in_main_q =
degrees_prefix_sum[main_q_idx];
// arc_offset_start is the offset in the CSR, to find the arcs
// related to the state main_q_state_[main_q_idx]
// it was set by the preprocess kernel
const int32 arc_offset_start =
cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx];
// local_arc_index is the arc index for that state
// if local_arc_index == 2, we will process the second arc
// of state main_q_state_[main_q_idx]
const int32 local_arc_index =
main_q_arc_index - state_first_arc_idx_in_main_q;
// corresponding arc_idx in the FST
arc_idx = arc_offset_start + local_arc_index;
// Destination of that arc
arc_next_state = cst_dev_params.d_arc_nextstates[arc_idx];
// Building the total cost incrementally
// we'll add the acoustic cost and the old token's cost
const CostType arc_fixed_cost = cst_dev_params.d_arc_weights[arc_idx];
const CostType prev_token_cost = orderedIntToFloat(
cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx]
.y);
CostType total_cost = prev_token_cost + arc_fixed_cost;
const int32 prev_state =
cst_dev_params.d_main_q_state_and_cost.channel(ichannel)[main_q_idx]
.x;
if (IS_EMITTING) {
const int32 arc_ilabel = cst_dev_params.d_arc_pdf_ilabels[arc_idx];
CostType acoustic_cost = -lane_counters->loglikelihoods[arc_ilabel];
total_cost += acoustic_cost;
}
int_total_cost = floatToOrderedInt(total_cost);
// If the total_cost is too large compared to our cutoff (beam search)
// then let's drop it
const IntegerCostType int_cutoff = lane_counters->int_cutoff;
if (int_total_cost >= int_cutoff) int_total_cost = INT_MAX;
}
// If int_total_cost < INT_MAX, it means that :
// - this thread had a valid input (main_q_arc_index < total_narcs)
// - the total_cost of the generated token is < cutoff
// We will then add that new token in the output queue, aux_q
// We need to know where to put that token in the aux_q
// we'll first compute its index inside the CUDA block
// the first valid output token in the CUDA block will have index 0,
// the second index 1... We compute that using a prefix sum
//
// We also need to find the overall min cost in the CUDA block
// a prefix sum is a scan operation, and a min a reduce operation
// we can perform a reduce operation using a scan (using the last value)
      // we compute the prefix sum and the min in one scan, using an int2
      // and the MinPlus operator
const int32 has_successor = (int_total_cost < INT_MAX) ? 1 : 0;
int2 int_cost_and_index = {int_total_cost, has_successor};
BlockScan(sh_temp_storage_scan)
.InclusiveScan(int_cost_and_index, int_cost_and_index, MinPlus());
if (KALDI_CUDA_DECODER_IS_LAST_1D_THREAD()) {
// We are in a divergent branch
// This is the last thread. The last value of the inclusive scan is the
// total
const int32 total_successors_in_block = int_cost_and_index.y;
// Requesting a spot of size total_successors_in_block in the aux_q
// note: using 2 atomics here to avoid adding another kernel
// first request more space
const int aux_q_index_block_offset = atomicAdd(
&lane_counters->aux_q_requested, total_successors_in_block);
// check for overflow in aux_q
        // We try to prevent an overflow from happening using an adaptive beam
        // (cf UpdateAdaptiveBeam)
if (aux_q_index_block_offset + total_successors_in_block <
cst_dev_params.aux_q_capacity) {
// no overflow
// grab the aux_q offset
sh_aux_q_index_block_offset =
atomicAdd(&lane_counters->aux_q_end, total_successors_in_block);
// We are not overflowing the queue, updating the global values
IntegerCostType global_min_int_cost = lane_counters->min_int_cost;
IntegerCostType local_min_int_cost = int_cost_and_index.x;
// if we found a lower min_cost, update the global value
if (local_min_int_cost < global_min_int_cost) {
global_min_int_cost = local_min_int_cost;
atomicMin(&lane_counters->min_int_cost, global_min_int_cost);
CostType beam =
orderedIntToFloat(adaptive_int_beam_with_validity_index.x);
IntegerCostType new_int_cutoff = floatToOrderedInt(
orderedIntToFloat(local_min_int_cost) + beam);
atomicMin(&lane_counters->int_cutoff, new_int_cutoff);
}
int32 beam_valid_until_idx =
adaptive_int_beam_with_validity_index.y;
if (aux_q_index_block_offset >= beam_valid_until_idx) {
// This beam is no longer valid. Updating it
UpdateAdaptiveBeam(
cst_dev_params, aux_q_index_block_offset, global_min_int_cost,
&adaptive_int_beam_with_validity_index, lane_counters);
}
} else {
// sh_aux_q_index_block_offset is in shared memory
// its value is currently invalid (overflow)
// we set it to a special value and use it as a flag to broadcast
// the fact that we have an overflow and that all threads should exit
sh_aux_q_index_block_offset = cst_dev_params.aux_q_capacity;
// Setting the flag for the host. It will be used to print a warning
// to stderr
lane_counters->q_overflow |= OVERFLOW_AUX_Q;
// We do not jump to end_lane now, because only
// the first thread (threadIdx.x == 0) is executing this
// We wait until the end of the divergent branch
}
}
// Sync'ing for two reasons :
// - Broadcasting sh_aux_q_index_block_offset
// - reusing sh_temp_storage (cf CUB's doc)
__syncthreads();
      // The only case where that condition can be met
      // is if we detected an overflow in the previous lines
if (sh_aux_q_index_block_offset == cst_dev_params.aux_q_capacity)
goto end_lane; // done for this lane
//
// If we're executing the following lines it means everything
// is valid and we are not overflowing the aux_q
//
int_cost_and_index.y -= has_successor; // we want the exclusive sum now
const int32 aux_q_block_index = int_cost_and_index.y;
const int32 aux_q_index = sh_aux_q_index_block_offset + aux_q_block_index;
if (has_successor) {
// We save the new token to the aux_q
cst_dev_params.d_aux_q_state_and_cost.lane(ilane)[aux_q_index] = {
arc_next_state, int_total_cost};
// Index of the parent token
// the parent is the token used as input (source of arc)
// that parent is at index main_q_idx in the GPU memory
        // However, the main_q is emptied before processing a new frame,
        // so we need to add the offset related to the previous frames :
        // we add lane_counters->main_q_global_offset
const int32 prev_token =
lane_counters->main_q_global_offset + main_q_idx;
assert(main_q_idx >= 0 && main_q_idx < cst_dev_params.main_q_capacity);
cst_dev_params.d_aux_q_info.lane(ilane)[aux_q_index] = {prev_token,
arc_idx};
}
}
end_lane:; // ";" is an empty statement
}
}
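// Illustrative sketch (not used by the decoder): the "maximum index i such
// that prefix_sum[i] <= value" binary search that the comments in
// expand_arcs_kernel describe. The decoder's actual binsearch_maxle comes
// from cuda-decoder-kernels-utils.h; this hypothetical toy_binsearch_maxle
// only restates the idea. With prefix_sum = [0, 3, 3, 5] and value = 4 it
// returns 2 : the arc with global index 4 belongs to the token at main_q
// index 2.
__device__ __forceinline__ int32 toy_binsearch_maxle(const int32 *prefix_sum,
                                                     int32 value, int32 low,
                                                     int32 high) {
  // Assumes prefix_sum[low] <= value, as in the kernel above
  while (low < high) {
    const int32 mid = low + (high - low + 1) / 2;  // upper mid avoids a loop
    if (prefix_sum[mid] <= value)
      low = mid;  // mid still satisfies the predicate, search to the right
    else
      high = mid - 1;  // mid is too far, search to the left
  }
  return low;
}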
// post_expand_kernel
// Called after expand_arcs_kernel
// Takes care of what needs to be done after an expand_arcs_kernel
// execution. Mostly resetting the beam (if adaptive beam was triggered,
// the max_active_ kernels will take care of selecting a good beam),
// resetting the number of arcs in the main_q (we've processed them all),
// etc.
// Threads (1,1,1)
// Blocks (1, nlanes_used, 1)
template <bool IS_EMITTING>
__global__ void post_expand_kernel(DeviceParams cst_dev_params,
KernelParams params) {
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
LaneCounters *h_lane_counters = cst_dev_params.h_lanes_counters.lane(ilane);
const int prev_main_q_end = lane_counters->main_q_narcs_and_end.y;
const int prev_n_extra_prev_tokens =
lane_counters->main_q_n_extra_prev_tokens;
const int aux_q_end = lane_counters->aux_q_end;
CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost);
// The next step is the contracting step from aux_q to main_q
// It will need the aux_q_end value. But it will also empty the aux_q
// We're resetting aux_q_end to 0 now, but we're saving its old value
// in another place
lane_counters->post_expand_aux_q_end = aux_q_end;
h_lane_counters->post_expand_aux_q_end = aux_q_end; // pinned memory
h_lane_counters->q_overflow = lane_counters->q_overflow; // pinned memory
lane_counters->aux_q_end = 0;
lane_counters->aux_q_requested = 0;
// We are done processing those arcs
lane_counters->main_q_narcs_and_end.x = 0;
// Resetting the adaptive beam
lane_counters->adaptive_int_beam_with_validity_index.x =
lane_counters->int_beam;
lane_counters->adaptive_int_beam_with_validity_index.y =
cst_dev_params.adaptive_beam_static_segment;
CostType beam = orderedIntToFloat(lane_counters->int_beam);
lane_counters->int_cutoff = floatToOrderedInt(min_cost + beam);
// If the adaptive beam kicked in, we want to reset the beam
// the max-active process will take care of selecting the right beam
if (IS_EMITTING) {
// the main_q contains the tokens from the previous frame
// after emitting, we won't use them anymore to create new tokens
// we reset the main_q
lane_counters->main_q_narcs_and_end = {0, 0};
lane_counters->main_q_requested = 0;
// The main_q was flushed - we need to update the global_offset
lane_counters->main_q_global_offset += prev_main_q_end;
if (threadIdx.x == 0 && blockIdx.x == 0)
lane_counters->main_q_extra_prev_tokens_global_offset +=
prev_n_extra_prev_tokens;
      // Moving local offset. Tokens created by the last expand
      // will be pruned, and survivors will be moved to the end
      // of the main_q. Those tokens will be placed after local_offset
lane_counters->main_q_requested = 0;
CostType min_cost = orderedIntToFloat(lane_counters->min_int_cost);
lane_counters->min_histo_cost = min_cost;
lane_counters->max_histo_cost = min_cost + beam;
lane_counters->histo_bin_width = beam / (KALDI_CUDA_DECODER_HISTO_NBINS-1);
} else {
lane_counters->main_q_local_offset = prev_main_q_end;
// reset requested to end of queue
lane_counters->main_q_requested = prev_main_q_end;
}
}
}
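// post_contract_and_preprocess_kernel
// Runs after the contraction into the main_q (as the name suggests).
// Copies the (narcs, main_q_end) pair and the overflow flag to pinned host
// memory, and lowers main_q_n_emitting_tokens to the current main_q end
// with an atomicMin.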
__global__ void post_contract_and_preprocess_kernel(DeviceParams cst_dev_params,
KernelParams params) {
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
LaneCounters *h_lane_counters = cst_dev_params.h_lanes_counters.lane(ilane);
int2 main_q_narcs_and_end = lane_counters->main_q_narcs_and_end;
h_lane_counters->main_q_narcs_and_end =
main_q_narcs_and_end; // pinned memory
h_lane_counters->q_overflow = lane_counters->q_overflow; // pinned memory
atomicMin(&lane_counters->main_q_n_emitting_tokens, main_q_narcs_and_end.y);
}
}
// Meta-kernel (merging preprocess and expand), but only works with 1 CUDA
// block per lane
// Used to avoid calling multiple main kernels (such as expand_arcs_kernel)
// for the non-emitting tail (lots of iterations with a small number of arcs)
//
// Code is greatly simplified because we use only one CTA / lane
//
// Repeat until new queue empty:
// 1) Preprocess
// 2) Expand arcs
//
// The preprocess stage is not done on the first iteration, because it was
// already done by the ProcessAndContract kernel. We always call
// PruneAndPreprocess before calling FinalizeProcessNonemitting
//
// At the end, this kernel finalizes the computation for the current frame,
// so that it's ready for the next ProcessEmitting
//
// This kernel works, but can be greatly simplified now.
__launch_bounds__(KALDI_CUDA_DECODER_LARGEST_1D_BLOCK, 1) __global__
void finalize_process_non_emitting_kernel(DeviceParams cst_dev_params,
KernelParams params) {
typedef cub::BlockScan<int2, KALDI_CUDA_DECODER_LARGEST_1D_BLOCK>
Int2BlockScan;
typedef cub::BlockScan<int, KALDI_CUDA_DECODER_LARGEST_1D_BLOCK> IntBlockScan;
__shared__ typename IntBlockScan::TempStorage sh_temp_storage_int_scan;
__shared__ typename Int2BlockScan::TempStorage sh_temp_storage_int2_scan;
const int nlanes = params.nlanes_used;
KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
LaneCounters *lane_counters = cst_dev_params.d_lanes_counters.lane(ilane);
const int32 ichannel = lane_counters->channel_to_compute;
ChannelCounters *channel_counters =
cst_dev_params.d_channels_counters.channel(ichannel);
int2 both = lane_counters->main_q_narcs_and_end;
int32 main_q_narcs = both.x;
int32 main_q_end = both.y;
int32 main_q_local_offset = lane_counters->main_q_local_offset;
const int32 main_q_global_offset = lane_counters->main_q_global_offset;
// aux_q is empty when this kernel is called
int32 aux_q_end = 0;
IntegerCostType int_cutoff = lane_counters->int_cutoff;
while (main_q_narcs > 0) {
// Step 1 : ExpandArcs
KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, thread_idx,
main_q_narcs) {
const int32 main_q_arc_idx = offset + thread_idx;
// For details on how this code works, please refer to comments in
// expand_arcs
IntegerCostType total_int_cost = INT_MAX;
int32 arc_idx;
StateId arc_next_state;
int32 main_q_idx;
if (main_q_arc_idx < main_q_narcs) {
main_q_idx = binsearch_maxle(
cst_dev_params.d_main_q_degrees_prefix_sum.channel(ichannel),
main_q_arc_idx, main_q_local_offset, main_q_end - 1);
const int32 state_first_arc_idx_in_main_q =
cst_dev_params.d_main_q_degrees_prefix_sum.channel(
ichannel)[main_q_idx];
const int32 arc_offset_start =
cst_dev_params.d_main_q_arc_offsets.channel(ichannel)[main_q_idx];
arc_idx = arc_offset_start +
(main_q_arc_idx - state_first_arc_idx_in_main_q);