-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtraining.py
115 lines (99 loc) · 3.67 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import json
from rich.prompt import Prompt
# Ask which base checkpoint to fine-tune.
mode = Prompt.ask("Based GPT", choices=["d", "default", "v1", "v2-base", "v2-small", "v2-medium", "trained"])

# Checkpoint loaded for each explicitly handled mode. Any mode not listed here
# ("d" and "default") falls back to _DEFAULT_CHECKPOINT.
_CHECKPOINTS = {
    "v1": 'openai-community/gpt2',
    "v2-base": 'openai-community/gpt2',
    # The smallest GPT-2 checkpoint is published as plain "gpt2"; the
    # previously used 'openai-community/gpt2-small' id could not be loaded.
    "v2-small": 'openai-community/gpt2',
    "v2-medium": 'openai-community/gpt2-medium',
    # Local directory this script writes to after training, so a previous
    # run can be fine-tuned further.
    "trained": './trained_model',
}
_DEFAULT_CHECKPOINT = 'openai-community/gpt2-medium'

# Load the tokenizer and the model from the same checkpoint.
_checkpoint = _CHECKPOINTS.get(mode, _DEFAULT_CHECKPOINT)
tokenizer = GPT2Tokenizer.from_pretrained(_checkpoint)
model = GPT2LMHeadModel.from_pretrained(_checkpoint)

# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' can be used when tokenizing.
tokenizer.pad_token = tokenizer.eos_token
# Ask for the location of the training data.
path = Prompt.ask("Path")

# Read the JSON training data.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)


def _render_message(message):
    """Return one message wrapped in its role tag.

    Messages whose role is neither 'user' nor 'assistant' render as an
    empty string, i.e. they contribute nothing to the conversation text.
    """
    role = message['role']
    content = message['content']
    if role == 'user':
        return f"<user>{content}</user>"
    if role == 'assistant':
        return f"<assistant>{content}</assistant>"
    return ""


# Flatten every conversation into a single string of tagged messages.
conversations = [
    "".join(_render_message(turn) for turn in dialogue)
    for dialogue in data
]

# Build the DataFrame consumed by the dataset.
df = pd.DataFrame({'conversation': conversations})
class ConversationDataset(Dataset):
    """Dataset that tokenizes one conversation string per item.

    Each item is a dict with 'input_ids', 'attention_mask' and 'labels',
    every entry a 1-D tensor of length ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_length: int = 512):
        """Store the data source and tokenization settings.

        Args:
            dataframe: DataFrame with a 'conversation' column of strings.
            tokenizer: Callable tokenizer; it is invoked with ``max_length``,
                ``padding``, ``truncation`` and ``return_tensors`` keyword
                arguments and must return 'input_ids' and 'attention_mask'.
            max_length: Length every sequence is padded or truncated to.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of conversations."""
        return len(self.dataframe)

    def __getitem__(self, index: int):
        """Tokenize the conversation at ``index`` and build its labels."""
        convo = self.dataframe.iloc[index]['conversation']
        encoding = self.tokenizer(
            convo,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length is 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        # Padding positions must not contribute to the language-modeling
        # loss. -100 is the label value the loss ignores; without it the
        # padding added by padding='max_length' dominates the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
# Wrap the conversation DataFrame in the training dataset.
train_dataset = ConversationDataset(df, tokenizer)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=1,
    # Schedule and optimization.
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

# Build the trainer and run training.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

# Save the model and the tokenizer side by side in one directory.
model_save_path = "./trained_model"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(model_save_path)
print(f"Model Saved to {model_save_path}")