@@ -10,6 +10,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <unordered_set>
 
 // determine number of model parts based on the dimension
 static const std::map<int, int> LLAMA_N_PARTS = {
@@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }
 
     // load vocab
+
+    std::unordered_set<std::string> unprintable_characters = {" ", "�", "��"};
+
     {
         const int32_t n_vocab = model.hparams.n_vocab;
 
@@ -131,7 +135,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                     __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
             return false;
         }
-
+
         std::string word;
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);
 
+            if (unprintable_characters.find(word) != unprintable_characters.end()) {
+                continue;
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
         }
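(Aside, not part of the patch: the set above only catches those exact byte strings. A more general filter, sketched here as a hypothetical alternative, would skip any token containing the UTF-8 replacement character U+FFFD, i.e. the byte sequence EF BF BD:)

```cpp
#include <string>

// Hypothetical generalization of the skip above: true if the token contains
// the UTF-8 replacement character U+FFFD (bytes 0xEF 0xBF 0xBD) anywhere.
static bool contains_replacement_char(const std::string & word) {
    return word.find("\xEF\xBF\xBD") != std::string::npos;
}
```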
@@ -792,7 +800,7 @@ int main(int argc, char ** argv) {
         printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
     printf("\n");
-    printf("sampling parameters: temp = %f, top_k = %d, top_p = %f\n", params.temp, params.top_k, params.top_p);
+    printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
     printf("\n\n");
 
     std::vector<gpt_vocab::id> embd;
@@ -801,6 +809,10 @@ int main(int argc, char ** argv) {
     size_t mem_per_token = 0;
     llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
+    int last_n_size = params.repeat_last_n;
+    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
     for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
         // predict
         if (embd.size() > 0) {
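The three `+` lines above create the sliding window for the repeat penalty: a vector holding the `repeat_last_n` most recent token ids, zero-filled until real tokens arrive. A minimal standalone sketch of how that window behaves, with token ids simplified to `int` and a window size of 4 assumed:

```cpp
#include <algorithm>
#include <cstdio>
#include <initializer_list>
#include <vector>

int main() {
    const int last_n_size = 4; // stand-in for params.repeat_last_n
    std::vector<int> last_n_tokens(last_n_size);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); // padded with 0s

    // Each new token id pushes the oldest one out, FIFO-style.
    for (int id : {17, 42, 42, 7, 99}) {
        last_n_tokens.erase(last_n_tokens.begin()); // drop oldest
        last_n_tokens.push_back(id);                // append newest
    }
    for (int id : last_n_tokens) printf("%d ", id); // prints: 42 42 7 99
    printf("\n");
}
```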
@@ -821,6 +833,7 @@ int main(int argc, char ** argv) {
             // sample next token
             const float top_p = params.top_p;
             const float temp = params.temp;
+            const float repeat_penalty = params.repeat_penalty;
 
             const int n_vocab = model.hparams.n_vocab;
 
@@ -829,7 +842,10 @@ int main(int argc, char ** argv) {
             {
                 const int64_t t_start_sample_us = ggml_time_us();
 
-                id = llama_sample_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_p, temp, rng);
+                id = llama_sample_top_p(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_p, temp, rng);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
 
                 t_sample_us += ggml_time_us() - t_start_sample_us;
             }
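This hunk only changes the call site; the body of `llama_sample_top_p` is not shown in the diff. For orientation, a common way such a penalty is applied (the scheme popularized by the CTRL paper) is sketched below; the helper name `apply_repeat_penalty` and the plain `float`/`int` types are assumptions, not this file's actual code:

```cpp
#include <vector>

// Illustrative sketch: dampen the logits of recently seen tokens before
// top-p sampling, so they are less likely to be picked again.
static void apply_repeat_penalty(std::vector<float> & logits,
                                 const std::vector<int> & last_n_tokens,
                                 float repeat_penalty) {
    for (int id : last_n_tokens) {
        // Dividing a positive logit (or multiplying a negative one) by a
        // penalty > 1 always lowers the token's score.
        if (logits[id] > 0.0f) {
            logits[id] /= repeat_penalty;
        } else {
            logits[id] *= repeat_penalty;
        }
    }
}
```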
@@ -840,6 +856,8 @@ int main(int argc, char ** argv) {
             // if here, it means we are still processing the input prompt
             for (int k = i; k < embd_inp.size(); k++) {
                 embd.push_back(embd_inp[k]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[k]);
                 if (embd.size() > params.n_batch) {
                     break;
                 }
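Note that this last hunk advances the same window while the prompt is still being consumed, so once generation starts the repeat penalty already reflects the most recent prompt tokens, not just freshly sampled ones.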