-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathtrain-test-split.py
44 lines (37 loc) · 1.53 KB
/
train-test-split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
from termcolor import colored
from sklearn.model_selection import train_test_split
# Define variables
COLUMNS = ['Sentiment', 'Id', 'Date', 'Flag', 'User', 'Tweet']
# Read dataset
dataset = pd.read_csv('data/dataset.csv', names = COLUMNS, encoding = 'latin-1')
print(colored("Columns: {}".format(', '.join(COLUMNS)), "yellow"))
# Remove extra columns
print(colored("Useful columns: Sentiment and Tweet", "yellow"))
print(colored("Removing other columns", "red"))
dataset.drop(['Id', 'Date', 'Flag', 'User'], axis = 1, inplace = True)
print(colored("Columns removed", "red"))
# Train test split
print(colored("Splitting train and test dataset into 80:20", "yellow"))
X_train, X_test, y_train, y_test = train_test_split(dataset['Tweet'], dataset['Sentiment'], test_size = 0.20, random_state = 100)
train_dataset = pd.DataFrame({
'Tweet': X_train,
'Sentiment': y_train
})
print(colored("Train data distribution:", "yellow"))
print(train_dataset['Sentiment'].value_counts())
test_dataset = pd.DataFrame({
'Tweet': X_test,
'Sentiment': y_test
})
print(colored("Test data distribution:", "yellow"))
print(test_dataset['Sentiment'].value_counts())
print(colored("Split complete", "yellow"))
# Save train data
print(colored("Saving train data", "yellow"))
train_dataset.to_csv('data/train.csv', index = False)
print(colored("Train data saved to data/train.csv", "green"))
# Save test data
print(colored("Saving test data", "yellow"))
test_dataset.to_csv('data/test.csv', index = False)
print(colored("Test data saved to data/test.csv", "green"))