ft_llama2.py
'''script to finetune Llama2'''
# import getpass
# import yaml
import json
import locale
import logging
import os

import numpy as np
import pandas as pd
import torch
from ludwig.api import LudwigModel
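
# Some hosted runtimes (e.g. Colab) report a non-UTF-8 preferred
# encoding; overriding it keeps the JSONL reads below consistent.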
locale.getpreferredencoding = lambda: "UTF-8"
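
# Fix the NumPy seed so the train/validation/test shuffle below is reproducible.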
np.random.seed(123)


def clear_cache():
    '''clears the CUDA cache'''
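    # empty_cache releases cached, unused allocator blocks back to the
    # GPU, freeing headroom before the Llama-2 weights are loaded.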
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def main(n_examples=500, is_test=False):
    '''main function to run fine-tuning'''
    # os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass.getpass("Token:")
    # The Llama-2 weights are gated on the Hugging Face Hub, so a token is required.
    assert os.environ.get("HUGGING_FACE_HUB_TOKEN"), "HUGGING_FACE_HUB_TOKEN is not set"
with open("arxiv_physics_instruct_30k.jsonl", encoding="utf-8") as f_in:
data1 = [json.loads(line) for line in f_in]
with open("arxiv_math_instruct_50k.jsonl", encoding="utf-8") as f_in:
data2 = [json.loads(line) for line in f_in]
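
    # Each JSONL line is expected to hold one instruction example with
    # "question" and "answer" fields (the columns used further down).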
    df1 = pd.DataFrame(data1)
    df2 = pd.DataFrame(data2)
    df_both = pd.concat([df1, df2])

    # Shuffle the combined dataset and renumber the rows.
    main_df = df_both.sample(frac=1, random_state=42)
    main_df.reset_index(drop=True, inplace=True)
    # Create a new column called `split` where:
    #   90% of rows get the value 0 -> train set
    #    5% of rows get the value 1 -> validation set
    #    5% of rows get the value 2 -> test set

    # Calculate the number of rows for each split value
    total_rows = len(main_df)
    split_0_count = int(total_rows * 0.9)
    split_1_count = int(total_rows * 0.05)
    split_2_count = total_rows - split_0_count - split_1_count
    # Create an array with split values based on the counts
    split_values = np.concatenate(
        [np.zeros(split_0_count), np.ones(split_1_count), np.full(split_2_count, 2)]
    )

    # Shuffle the array to ensure randomness
    np.random.shuffle(split_values)

    # Add the 'split' column to the DataFrame
    main_df["split"] = split_values
    main_df["split"] = main_df["split"].astype(int)
    # Use only a small fraction of the dataset; since the split values were
    # shuffled uniformly, the subset keeps roughly the 90/5/5 proportions.
    main_df = main_df.head(n=n_examples)
    print(main_df.head(3))
    print(f"Total number of examples in the dataset: {main_df.shape[0]}")
    # Calculate the length of each cell in each column
    main_df["num_characters_question"] = main_df["question"].apply(len)
    main_df["num_characters_answer"] = main_df["answer"].apply(len)

    # Show the length distributions
    main_df.hist(column=["num_characters_question", "num_characters_answer"])
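
    # The prints below approximate token counts by assuming roughly
    # three characters per token.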
    # Calculate the averages
    average_chars_instruction = main_df["num_characters_question"].mean()
    average_chars_output = main_df["num_characters_answer"].mean()
    print(
        f"Average number of tokens in the instruction column: {(average_chars_instruction / 3):.0f}"
    )
    print(
        f"Average number of tokens in the output column: {(average_chars_output / 3):.0f}",
        end="\n\n",
    )

    clear_cache()
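
    # The CPU config is a lightweight fallback so tiny runs (and smoke
    # tests) work without a GPU; larger runs use the GPU fine-tuning config.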
    # Fine-tune Llama-2, 5 epochs
    if n_examples <= 10:
        model = LudwigModel(config="config_ft_cpu.yaml", logging_level=logging.INFO)
    else:
        model = LudwigModel(config="config_ft.yaml", logging_level=logging.INFO)
    if not is_test:
        model.train(dataset=main_df)
    return 0


if __name__ == "__main__":
    main()
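
# A minimal sketch of what `config_ft.yaml` might contain (hypothetical;
# the actual configs live alongside this script and may differ):
#
#     model_type: llm
#     base_model: meta-llama/Llama-2-7b-hf
#     input_features:
#       - name: question
#         type: text
#     output_features:
#       - name: answer
#         type: text
#     trainer:
#       type: finetune
#       epochs: 5
#
# Example smoke-test invocation that builds the model but skips training:
#     main(n_examples=10, is_test=True)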