unit Request;
{$mode ObjFPC}{$H+}
interface
uses
Classes, SysUtils, neuroengineapi, strutils, llama, Math, Generics.Collections, OpenAIClient, OpenAIDtos, OptionsForm;
type
TTokenList = class(specialize TList<Tllama_token>)
public
function Data(const P: Integer = 0): Pllama_token;
end;
type
TUpdate_token_callback = procedure (message:string);
// Handle Neuroengine.ai requests
TRequestThread = class(TThread)
protected
procedure Execute; override;
public
ModelName: string;
maxContextLen: Integer;
PromptToAnswer: Unicodestring;
Response: Unicodestring;
PartialAnswer: Unicodestring;
constructor Create(Service: string; maxCTXLen: Integer);
end;
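{ Usage sketch (hypothetical caller code, e.g. a form's send handler; the
service name is a placeholder):
var R: TRequestThread;
begin
R := TRequestThread.Create('SomeNeuroengineService', 2048);
R.PromptToAnswer := 'Hello!';
R.Start; // the thread is created suspended
// ...poll R.PartialAnswer for streaming updates; read R.Response when done
end; }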
// Handle Llama.cpp engine
TllamaCPPThread = class(TRequestThread)
private
// Llama context
Ctx : Pllama_context;
// Llama model
Model : Pllama_model;
// Llama model parameters
Params : Tllama_model_params;
// Vocabulary size
n_vocab: longint;
// Array to store candidate tokens
candidates: array of Tllama_token_data;
candidates_p: Tllama_token_data_array;
// Input embeddings
EmbdInp : TTokenList;
// Minimum number of kept candidates, number of probabilities,
// constants for calculations, and number of generated tokens
min_keep, n_probs, C, n_gen: Integer;
// Tokens for the sequence-to-sequence task
SSTokens: TTokenList;
// Logits (raw prediction scores) from the model
logits: Psingle;
// Token ID
token_id: Tllama_token;
// Maximum context size and maximum size for token lists
max_context_size,max_tokens_list_size: Integer;
// Flag indicating if the model is loaded
Loaded : boolean;
// Model file name
ModelFName: string;
protected
procedure Execute; override;
public
// Constructor with required parameters
constructor Create(ModelPath: string; llm: Pllama_model; AParams: Tllama_model_params; maxCTXLen: Integer);
end;
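{ Usage sketch (hypothetical; assumes the llama unit mirrors the C API's
llama_model_default_params and llama_load_model_from_file):
var P: Tllama_model_params; M: Pllama_model; T: TllamaCPPThread;
begin
P := llama_model_default_params;
M := llama_load_model_from_file(PChar('model.gguf'), P);
T := TllamaCPPThread.Create('model.gguf', M, P, 2048);
T.PromptToAnswer := 'Hello!';
T.Start;
end; }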
// Handle ChatGPT engine
TChatGPTThread = class(TRequestThread)
protected
// Overridden method to execute the thread
procedure Execute; override;
private
// OpenAI client instance
FClient: IOpenAIClient;
// List of messages in the conversation
FMessages: TChatCompletionRequestMessageList;
// Method to add a message into the conversation history
procedure AddMessage(const Role, Content: string);
// Method to ask a question and get an answer
function AskQuestion(const Question: string): string;
public
// Constructor with required parameters
constructor Create(ModelPath: string; apikey: string; maxCTXLen: Integer);
end;
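{ Usage sketch (hypothetical caller; MyApiKey is a placeholder):
var GPT: TChatGPTThread;
begin
GPT := TChatGPTThread.Create('gpt-3.5-turbo', MyApiKey, 1024);
GPT.PromptToAnswer := 'Explain generics in Free Pascal.';
GPT.Start;
end; }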
implementation
// Return a raw pointer to the P-th element of the internal token array,
// as expected by the llama.cpp C functions.
function TTokenList.Data(const P: Integer = 0): Pllama_token;
begin
Result := @FItems[P];
end;
{TChatGPTThread}
// Initialize the TChatGPTThread object
constructor TChatGPTThread.Create(ModelPath: string; apikey: string; maxCTXLen: Integer);
begin
// Call the parent constructor; the thread is created suspended
inherited Create(ModelPath, maxCTXLen);
// Allocate memory for storing messages in the conversation
FMessages := TChatCompletionRequestMessageList.Create;
// Set up the OpenAI client with access token (API Key)
FClient := TOpenAIClient.Create;
FClient.Config.AccessToken := apikey;
end;
procedure TChatGPTThread.AddMessage(const Role, Content: string);
var
Msg: TChatCompletionRequestMessage;
begin
Msg := TChatCompletionRequestMessage.Create;
FMessages.Add(Msg);
Msg.Role := Role;
Msg.Content := Content;
end;
function TChatGPTThread.AskQuestion(const Question: string): string;
var
Request: TCreateChatCompletionRequest; // Object to hold the request parameters.
ResponseG: TCreateChatCompletionResponse; // Object to hold the response from the OpenAI API.
SourceMsg, TargetMsg: TChatCompletionRequestMessage; // Message objects to transfer data between the local message list and the request object.
begin
ResponseG := nil; // Initialize the response object as nil.
Request := TCreateChatCompletionRequest.Create; // Create a new request object.
try
AddMessage('user', Question); // Add the user's question to the messages list.
Request.Model := self.ModelName; // Set the requested model name.
Request.MaxTokens := self.maxContextLen; // Upper bound on tokens the completion may generate.
// Be cautious with this setting: large values can exhaust your API quota quickly.
for SourceMsg in FMessages do // Iterate through the local messages list.
begin
TargetMsg := TChatCompletionRequestMessage.Create; // Create a new target message object.
Request.Messages.Add(TargetMsg); // Add the target message object to the request object's messages list.
TargetMsg.Role := SourceMsg.Role; // Copy the role ('system', 'user', or 'assistant') from source to target.
TargetMsg.Content := SourceMsg.Content; // Copy the content (the text of the message) from source to target.
end;
try
ResponseG := FClient.OpenAI.CreateChatCompletion(Request); // Send the request to the OpenAI API and store the result.
except
Result := 'Connection error.'; // If the API call fails, return an error message.
exit;
end;
if Assigned(ResponseG.Choices) and (ResponseG.Choices.Count > 0) then
begin
Result := ResponseG.Choices[0].Message.Content; // Take the generated reply from the first choice.
AddMessage('assistant', Result); // Append the AI's reply to the conversation history.
end
else
Result := ''; // No valid choices were returned; fall back to an empty string.
finally
Request.Free; // Free allocated objects after use.
ResponseG.Free;
end;
end;
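{ Note: FMessages persists across calls, so the conversation history grows:
AskQuestion('What is FPC?'); // request carries: user
AskQuestion('Who maintains it?'); // request carries: user, assistant, user }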
{Starts a conversation via PromptToAnswer, gets AI response, then ends.}
procedure TChatGPTThread.Execute;
begin
Response := AskQuestion(PromptToAnswer);
Terminate;
end;
{TllamaCPPThread}
// Constructor initializes the Llama CPP thread object with the given parameters
// Args: ModelPath (string): Path the pretrained model was loaded from
// llm (Pllama_model): Pointer to the Llama model instance
// AParams (Tllama_model_params): Configuration settings for the model
// maxCTXLen (Integer): Maximum context length for the model
constructor TllamaCPPThread.Create(ModelPath: string; llm: Pllama_model; AParams: Tllama_model_params; maxCTXLen: Integer);
begin
inherited Create(ModelPath, maxCTXLen); // Create the thread suspended
self.ModelFName := ModelPath;
self.Model := llm;
self.Params := AParams;
SSTokens := TTokenList.Create;
EmbdInp := TTokenList.Create;
end;
{ Purpose: Decodes hexadecimal byte values of the form <0xNN> embedded in
angle-bracket tags, converting each into the corresponding wide character
while copying all other text through unchanged. An invalid hexadecimal
value raises an exception.
}
function HexToUnicode(const HexString: string): WideString;
var
HexValue: string;
CodePoint: LongWord;
i: Integer;
InsideHtmlTag: Boolean;
begin
Result := '';
i := 1;
InsideHtmlTag := False;
while i <= Length(HexString) do
begin
if HexString[i] = '<' then
begin
InsideHtmlTag := True;
Result := Result + HexString[i];
Inc(i);
end
else if HexString[i] = '>' then
begin
InsideHtmlTag := False;
Result := Result + HexString[i];
Inc(i);
end
else if (InsideHtmlTag) and (HexString[i] = '0') then
{
Handle specific case where we have a valid hexadecimal value embedded in an HTML tag
1) Skip over leading zeros ('0x')
2) Collect consecutive digits until reaching terminating condition (either another angle bracket or end of string)
3) Attempt conversion from collected hexadecimal string to Unicode codepoint
4) If successful, append resulting wide char to output string
5) Otherwise, throw exception indicating invalid hexadecimal value
}
begin
Inc(i); // Skip '0'
Inc(i); // Skip 'x'
// Extract the hexadecimal value
HexValue := '';
while (i <= Length(HexString)) and (HexString[i] <> '>') do
begin
HexValue := HexValue + HexString[i];
Inc(i);
end;
// Convert the hexadecimal value to a Unicode code point
if TryStrToInt('$' + HexValue, LongInt(CodePoint)) then
Result := Result + WideChar(CodePoint)
else
raise Exception.Create('Invalid hexadecimal value: ' + HexValue);
end
else
begin
// Append non-hexadecimal characters or characters inside HTML tags as is
Result := Result + HexString[i];
Inc(i);
end;
end;
end;
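{ Example, traced from the code above: the hex digits are decoded, but the
surrounding angle brackets are kept in the output:
HexToUnicode('<0x41>') = '<A>'
HexToUnicode('plain text') = 'plain text' }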
{LLama.cpp Inference code}
// Procedure executed by the thread
procedure TllamaCPPThread.Execute;
var
// Declare variables used in the procedure
TokenStr: RawByteString;
answer: String;
p: Integer;
S: String;
N_THREADS: Integer;
TOP_K: Integer;
TOP_P: Single;
TYPICAL_P: Single;
TFS_Z: Single;
TEMP: Single;
uni: Byte;
ctxParams: Tllama_context_params;
begin
// Check if model exists, return error message if not present
if self.Model = nil then
begin
Response := 'This local AI is not loaded. Please open the AI file first.';
exit;
end;
// Assign basic parameter values from settings
N_THREADS := StrToIntDef(Settings.ComboBoxThreads.Text, 4);
TOP_K := StrToIntDef(Settings.LabeledEditK.Text, 40);
TOP_P := StrToFloatDef(Settings.LabeledEditP.Text, 0.88);
TYPICAL_P := 1.0;
TFS_Z := 1.0;
TEMP := StrToFloatDef(Settings.LabeledEditTemperature.Text, 1.0);
// Mask floating-point exceptions so they do not interrupt inference
SetExceptionMask(GetExceptionMask + [exOverflow, exZeroDivide, exInvalidOp]);
// Llama backend initialization
llama_backend_init(False);
// Context parameter setup
ctxParams := llama_context_default_params;
ctxParams.n_threads := N_THREADS;
ctxParams.n_threads_batch := N_THREADS;
ctxParams.offload_kqv := Settings.CheckBoxGPUOffloadCache.Checked;
// Create context with given model and parameters
Ctx := llama_new_context_with_model(Model, ctxParams);
// Calculate maximum context size and number of tokens based on context length limit
max_context_size := Min(llama_n_ctx(ctx), self.maxContextLen);
max_tokens_list_size := max_context_size - 4;
n_gen := Min(StrToIntDef(Settings.LabeledEditMaxLen.Text, 1024), max_context_size);
// Get the prompt text and tokenize it
S := PromptToAnswer;
EmbdInp.Count := Length(S) + 1; // Reserve one slot per character as an upper bound
C := llama_tokenize(Model, PChar(S), Length(S), EmbdInp.Data, EmbdInp.Count, False, False); // C receives the actual token count
candidates:=[];
// Main loop starts here
while llama_get_kv_cache_token_count(ctx) < n_gen do
begin
// Check if the context has been terminated, and if so, exit the loop
if self.Terminated then break;
// Run the Llama evaluation function with the input data, model, and cache size
llama_eval(Ctx, EmbdInp.Data, C, llama_get_kv_cache_token_count(ctx));
EmbdInp.Clear;
// Sample the next token by initializing an array of candidate tokens and their corresponding logits
n_vocab := llama_n_vocab(Model);
SetLength(candidates, n_vocab);
logits := llama_get_logits(ctx);
for p := 0 to n_vocab - 1 do
begin
// Initialize each candidate token with its ID, logit, and probability (initially set to zero)
candidates[p].id := p;
candidates[p].logit := logits[p];
candidates[p].p := 0.0;
end;
// Prepare the candidates list for sorting and sampling
candidates_p.data := @candidates[0];
candidates_p.size := n_vocab;
candidates_p.sorted := False;
// Calculate various parameters used for sampling, such as number of probabilities, minimum keep value, etc.
n_probs:=1;
min_keep := Max(1, n_probs);
// Perform several different types of sampling on the candidate tokens based on specific criteria
llama_sample_top_k(ctx, @candidates_p, top_k, min_keep);
llama_sample_tail_free(ctx, @candidates_p, tfs_z, min_keep);
llama_sample_typical(ctx, @candidates_p, typical_p, min_keep);
llama_sample_top_p(ctx, @candidates_p, top_p, min_keep);
llama_sample_temperature(ctx, @candidates_p, temp);
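{ This mirrors llama.cpp's classic sampling chain: top-k, tail-free, typical
and top-p each prune the candidate set, temperature rescales the remaining
logits, and llama_sample_token draws from the resulting distribution. }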
// Select the final token ID based on the sampled probabilities
token_id:=llama_sample_token(ctx, @candidates_p);
// Alternative greedy sampling
// token_id:=llama_sample_token_greedy(ctx, @candidates_p);
if token_id = llama_token_eos(Model) then
begin
break;
end
else begin
TokenStr := llama_token_get_text(Model, token_id);
// Map SentencePiece ('▁') and GPT-2 BPE ('Ġ') space markers back to plain spaces
TokenStr := StringReplace(TokenStr, '▁', ' ', [rfReplaceAll]);
TokenStr := StringReplace(TokenStr, 'Ġ', ' ', [rfReplaceAll]);
// Stop if the model starts generating the next "user" turn
if LowerCase(TokenStr) = 'user' then
break;
{ Detect byte tokens of the form <0xNN> that llama.cpp emits for raw bytes }
if Length(TokenStr) = 6 then
begin
if (TokenStr[1] = '<') and (TokenStr[6] = '>') then
begin
TokenStr := Copy(TokenStr, 4, Length(TokenStr) - 4); // Strip the '<0x' prefix and '>' suffix, keeping the two hex digits
p := Hex2Dec(TokenStr); // Convert the hexadecimal digits to a decimal value
uni := p; // Narrow to a byte
TokenStr := chr(uni); // Emit the corresponding character
end;
end;
// Based on whether the current token ID corresponds to a newline token, update the answer string accordingly
if token_id = llama_token_nl(Model) then
answer := answer + #10
else
answer := answer + TokenStr;
// Update the GUI with the partial answer, which includes the answer string so far
self.PartialAnswer := '<b><center>(Press ESC to stop generating)</center></b>' + answer;
// Add the current token ID to the list of embedded inputs
EmbdInp.Add(token_id);
// From now on, feed the model a single new token per llama_eval call
C := 1;
end;
end;
llama_free(Ctx);
SSTokens.Free;
EmbdInp.Free;
Response:=answer;
self.PartialAnswer:='';
Terminate;
end;
{TRequestThread}
constructor TRequestThread.Create(Service: string; maxCTXLen: Integer);
begin
inherited Create(True); // Create the thread suspended
self.ModelName:=Service;
self.maxContextLen:=maxCTXLen;
end;
// Function converting a given Unicode string into its escaped sequence representation
function UnicodeToEscape(const UnicodeStr: string): AnsiString;
var
i, len: Integer;
AnsiChr: AnsiChar;
begin
Result := '';
len := Length(UnicodeStr);
i := 1;
while i <= len do
begin
if (UnicodeStr[i] = '#') and (i + 5 <= len) and (UnicodeStr[i + 1] = '$') then
begin
AnsiChr := AnsiChar(StrToInt('$' + Copy(UnicodeStr, i + 2, 4)));
Result := Result + '\' + AnsiChr;
Inc(i, 6);
end
else
begin
Result := Result + UnicodeStr[i];
Inc(i);
end;
end;
end;
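{ Example, traced from the code above: a '#$' escape followed by four hex
digits is folded into one backslash-prefixed character:
UnicodeToEscape('abc#$0041def') = 'abc\Adef' }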
{ Procedure handling request thread execution for Neuroengine.ai API}
procedure TRequestThread.Execute;
var
answer, TokenStr: UnicodeString;
p, i: Integer;
TOP_K, TOP_P, TEMP: Single;
max_new_len: Integer;
streamkey:String;
gettokens:Integer;
begin
{ Retrieve configuration values for K, P, and temperature parameters }
TOP_K := StrToIntDef(Settings.LabeledEditK.Text, 40);
TOP_P := StrToFloatDef(Settings.LabeledEditP.Text, 0.88);
TEMP := StrToFloatDef(Settings.LabeledEditTemperature.Text, 1.0);
max_new_len := Self.MaxContextLen;
{Generate streamkey}
Randomize;
SetLength(streamkey, 32);
for i := 1 to 32 do
streamkey[i] := Chr(Ord('A') + Random(26)); // Generates a random uppercase letter
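{ The streamkey presumably identifies this generation stream to the
Neuroengine service, so that successive QueryAI calls below can fetch
further tokens of the same in-progress answer (an assumption based on
how it is used in the loop). }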
{ Call QueryAI function to obtain AI generated answer }
answer:='';
i:=0;
gettokens:=20;
while (True) do
begin
// Check if the context has been terminated, and if so, exit the loop
if self.Terminated then break;
TokenStr := QueryAI(Self.ModelName, PromptToAnswer, TEMP, TOP_P, TOP_K, 1.2, max_new_len, 0, True,streamkey,gettokens);
{ Start with a small batch so the first chunk arrives quickly, then grow the
batch size to amortize per-request overhead against LLM generation time }
gettokens := gettokens + 5;
if Length(TokenStr)=0 then
break;
answer := answer + TokenStr;
if (i = 0) then
begin
// The first chunk echoes the prompt; strip it from the accumulated answer
answer := Copy(answer, Length(PromptToAnswer), Length(answer));
end;
{ Cut the answer off where the model starts generating the next 'user:' turn }
p := PosEx('user:', LowerCase(answer));
if (p > 0) then
Delete(answer, p - 2, Length(answer));
// Update the GUI with the partial answer, which includes the answer string so far
self.PartialAnswer := '<b><center>(Press ESC to stop generating)</center></b>' + answer;
if (p>0) then break;
i:=i+1;
end;
{ Error checking and processing empty/null answers }
if answer = '' then
begin
Response := 'Connection error.';
Terminate;
Exit;
end;
Response:=Trim(answer);
self.PartialAnswer:='';
Terminate;
end;
end.