-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProtGPT2_Input_preprocesing.py
61 lines (48 loc) · 1.9 KB
/
ProtGPT2_Input_preprocesing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import random
def substitute_fasta_headers(fasta_file):
    """Read a FASTA file and return its sequences prefixed with "<|endoftext|>".

    Header lines (those starting with ">") are discarded. The lines between
    two headers are stripped of surrounding whitespace and concatenated into
    a single string. Records whose concatenated sequence is empty are skipped.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list with one string per non-empty record, in file order, each
        starting with "<|endoftext|>" followed by the record's sequence.
    """
    sequences = []
    parts = []  # stripped sequence lines of the record currently being read
    with open(fasta_file, 'r') as file:
        for line in file:
            if line.startswith('>'):
                # A header closes the previous record; the header text itself
                # is not kept.
                sequence = "".join(parts)
                if sequence:
                    sequences.append(sequence)
                parts = []
            else:
                parts.append(line.strip())
    # Flush the last record, which has no following header to close it.
    sequence = "".join(parts)
    if sequence:
        sequences.append(sequence)
    # Substitute FASTA headers with the string "<|endoftext|>"
    return ["<|endoftext|>" + seq for seq in sequences]
def split_dataset(sequences, train_ratio):
    """Randomly split ``sequences`` into two lists.

    The items are shuffled with the module-level ``random`` generator and the
    first ``int(len(sequences) * train_ratio)`` of them form the first list;
    the remainder forms the second. The shuffle is done on a copy, so the
    caller's ``sequences`` is left in its original order.

    Args:
        sequences: Items to split.
        train_ratio: Fraction (0 to 1 inclusive) of the items to place in the
            first returned list.

    Returns:
        A ``(train_set, validation_set)`` tuple of lists.

    Raises:
        ValueError: If ``train_ratio`` is not within [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        # A negative ratio would otherwise turn into a negative slice index
        # and silently produce a wrong split.
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}"
        )
    shuffled = list(sequences)  # copy so the caller's list is not reordered
    random.shuffle(shuffled)
    num_train = int(len(shuffled) * train_ratio)
    return shuffled[:num_train], shuffled[num_train:]
# Example usage
fasta_file = "./GFP_NCBI.fasta"  # Replace with the path to your FASTA file
train_ratio = 0.985  # Fraction of all sequences used for training (e.g., 0.9 for 90/10 split)
test_ratio = 0.0  # Fraction of the held-out sequences written to the test set; the rest is validation

# Output files; each receives its sequences joined by newlines
train_file = "train.txt"
validation_file = "validation.txt"
test_file = "test.txt"


def main():
    """Convert the FASTA file into train/validation/test text files.

    Reads ``fasta_file``, splits the sequences according to ``train_ratio``
    and ``test_ratio``, writes the three subsets to ``train_file``,
    ``validation_file`` and ``test_file``, and prints the output paths.
    """
    # Step 1: Substitute FASTA headers
    sequences = substitute_fasta_headers(fasta_file)

    # Step 2: Split dataset
    train_set, holdout_set = split_dataset(sequences, train_ratio)
    # split_dataset's ratio is the share kept in the FIRST returned list, so
    # validation keeps (1 - test_ratio) of the held-out sequences. Passing
    # test_ratio directly would leave validation empty when test_ratio is 0.
    validation_set, test_set = split_dataset(holdout_set, 1.0 - test_ratio)

    # Save sequences to separate files
    outputs = (
        (train_file, train_set),
        (validation_file, validation_set),
        (test_file, test_set),
    )
    for path, subset in outputs:
        with open(path, 'w') as output:
            output.write('\n'.join(subset))

    print("Train file saved as:", train_file)
    print("Validation file saved as:", validation_file)
    print("Test file saved as:", test_file)


if __name__ == "__main__":
    main()