import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------------------------------------------------------------------------
# Data / tokenization
# ---------------------------------------------------------------------------
# Unit of tokenization; one of "char", "word", "sent".
UNIT = "word"
# Minimum sequence length for training.
MIN_LEN = 1
# Maximum sequence length for training and inference.
MAX_LEN = 50
# Source / target vocabulary sizes (0 means limitless).
SRC_VOCAB_SIZE = 50000
TGT_VOCAB_SIZE = 50000

# ---------------------------------------------------------------------------
# Model architecture and optimization
# ---------------------------------------------------------------------------
# Recurrent cell type; "GRU" or "LSTM".
RNN_TYPE = "GRU"
# Number of directions (1: unidirectional, 2: bidirectional).
NUM_DIRS = 2
NUM_LAYERS = 2
# Hierarchical recurrent encoding is switched on only for sentence-level units.
HRE = UNIT == "sent"
# Encoder embedding; keys are drawn from (cnn, rnn, lookup, sae).
# NOTE(review): the value is presumably the embedding size -- confirm.
EMBED = dict(lookup=300)
HIDDEN_SIZE = 1000
DROPOUT = 0.5
LEARNING_RATE = 2e-4

# ---------------------------------------------------------------------------
# Decoder mechanisms
# ---------------------------------------------------------------------------
# Attention mechanism.
ATTN = True
# Copying mechanism.
COPY = False

# ---------------------------------------------------------------------------
# Decoding / batching
# ---------------------------------------------------------------------------
BEAM_SIZE = 1
BATCH_SIZE = 64

# ---------------------------------------------------------------------------
# Logging / checkpointing
# ---------------------------------------------------------------------------
# Verbosity level (0: None, 1: attention heatmap, 2: beam search).
VERBOSE = 0
# NOTE(review): the unit of these intervals (epochs? steps?) is not visible
# here -- confirm against the training loop.
EVAL_EVERY = 10
SAVE_EVERY = 10
# Number of decimal places to print.
NUM_DIGITS = 4

# ---------------------------------------------------------------------------
# Special tokens: padding, start of sequence, end of sequence, unknown token.
# Their indices are 0, 1, 2, 3 respectively.
# ---------------------------------------------------------------------------
PAD, SOS, EOS, UNK = "<PAD>", "<SOS>", "<EOS>", "<UNK>"
PAD_IDX, SOS_IDX, EOS_IDX, UNK_IDX = range(4)

# True when PyTorch reports that CUDA is available on this machine.
CUDA = torch.cuda.is_available()
torch.manual_seed(0) # for reproducibility
# torch.cuda.set_device(0)

# Configuration sanity check: ATTN and COPY must differ, i.e. exactly one of
# the two mechanisms is enabled. This is an explicit raise rather than a bare
# `assert` because assert statements are removed when Python runs with -O,
# which would let an invalid configuration through silently. AssertionError is
# kept as the exception type so existing callers observe no change.
if ATTN == COPY:
    raise AssertionError(
        "exactly one of ATTN and COPY must be enabled (ATTN=%r, COPY=%r)"
        % (ATTN, COPY)
    )