training.py
import pandas as pd
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import json
from rich.prompt import Prompt


class ConversationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        convo = self.dataframe.iloc[index]['conversation']
        encoding = self.tokenizer(
            convo,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        labels = input_ids.clone()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


def main():
    # Select the training mode
    mode = Prompt.ask("Based GPT", choices=["d", "default", "empty", "v2", "v2-medium", "trained"])

    # Load the model and tokenizer
    if mode == "empty":
        config = GPT2Config()
        model = GPT2LMHeadModel(config)
        tokenizer = GPT2Tokenizer.from_pretrained('openai-community/gpt2')
    elif mode == "v2":
        tokenizer = GPT2Tokenizer.from_pretrained('openai-community/gpt2')
        model = GPT2LMHeadModel.from_pretrained('openai-community/gpt2')
    elif mode == "v2-medium":
        tokenizer = GPT2Tokenizer.from_pretrained('openai-community/gpt2-medium')
        model = GPT2LMHeadModel.from_pretrained('openai-community/gpt2-medium')
    elif mode == "trained":
        tokenizer = GPT2Tokenizer.from_pretrained('./trained_model')
        model = GPT2LMHeadModel.from_pretrained('./trained_model')
    else:
        # "d" / "default" fall back to gpt2-medium
        tokenizer = GPT2Tokenizer.from_pretrained('openai-community/gpt2-medium')
        model = GPT2LMHeadModel.from_pretrained('openai-community/gpt2-medium')

    # Set the padding token (GPT-2 has no dedicated pad token)
    tokenizer.pad_token = tokenizer.eos_token

    # Get the path to the training data
    path = Prompt.ask("Path")

    # Load the JSON data
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process each conversation into a single concatenated text
    conversations = []
    for conversation in data:
        convo_text = ""
        for message in conversation:
            role = message['role']  # input / output
            content = message['content']
            convo_text += f"<|{role}|>{content}<|end|>"
        conversations.append(convo_text)

    # Create the dataframe
    df = pd.DataFrame({'conversation': conversations})

    # Prepare the dataset
    train_dataset = ConversationDataset(df, tokenizer)

    # Configure the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=3e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
    )

    # Initialize the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Run training
    trainer.train()

    # Save the model and tokenizer
    model_save_path = "./trained_model"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model Saved to {model_save_path}")


if __name__ == "__main__":
    main()
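
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the JSON layout this
# script appears to expect, inferred from the parsing loop in main(). The data
# is a list of conversations, each a list of messages with 'role' and
# 'content' keys. The file name 'sample_conversations.json' and the sample
# messages are hypothetical; supply your own path at the "Path" prompt.
# Kept fully commented out so the training script's behavior is unchanged.
# ---------------------------------------------------------------------------
# sample_data = [
#     [
#         {"role": "input", "content": "Hello, how are you?"},
#         {"role": "output", "content": "Hi! How can I help you today?"},
#     ],
#     [
#         {"role": "input", "content": "What is GPT-2?"},
#         {"role": "output", "content": "GPT-2 is a language model released by OpenAI."},
#     ],
# ]
# with open("sample_conversations.json", "w", encoding="utf-8") as f:
#     json.dump(sample_data, f, ensure_ascii=False, indent=2)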