Skip to content

Commit 723e4e9

Browse files
committedJul 29, 2023
kmers
0 parents  commit 723e4e9

27 files changed

+14029
-0
lines changed
 

‎README.md

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# k-mers
2+
Description and purpose of the project will come at a later point.
3+
4+
## Installation
5+
6+
### Prerequisites
7+
`g++`: through gcc installation
8+
`sqlite3`: Debian/Ubuntu: `apt install sqlite3`; macOS: `brew install sqlite3`
9+
`gzip / gunzip`: usually pre-installed
10+
`python3`: installed and part of path
11+
12+
### Preperation
13+
Clone the repo
14+
```
15+
git clone https://github.com/cemiu/kmers.git && cd kmers
16+
```
17+
Two folders need to be populated.
18+
#### pdb
19+
The pdb folder has to contain all experimental PDB file in the .ent.gz format.
20+
Instructions for downloading can be found here:
21+
https://www.rcsb.org/docs/programmatic-access/file-download-services
22+
https://files.wwpdb.org/pub/pdb/data/structures/divided/pdb/
23+
24+
Alternatively an outdated mirror or some can be found here: https://pycom.brunel.ac.uk/misc/pdb_2023-07-28.tar (42 GB)
25+
26+
Once downloaded they have to be placed in the `pdb` folder **without** being uncompressed. It does not matter whether they are in `pdb/file.ent.gz` or `pdb/<folder>/file.ent.gz`.
27+
28+
#### uniprotkb
29+
The project requires `uniprot_sprot.fasta.gz` (400 MB after processing)
30+
Optionally, `uniprot_trembl.fasta.gz` can be used, to match more PDBs (250 GB after processing).
31+
32+
The latter might result in (slightly) more PDBs which can be associated to a Protein. The difference is expected to be trivial.
33+
34+
Place the files in the `uniprotkb` folder without uncompressing them.
35+
By default, only Swiss-Prot is used. To also use TrEMBL, uncomment line 11 in `prepare.sh`.
36+
37+
### Running
38+
39+
To run the script, execute:
40+
```
41+
./prepare.sh
42+
```
43+
44+
This will
45+
- Compile C++ binaries
46+
- Process Swiss-Prot / TrEMBL into a database
47+
- (`uniprot_sprot.fasta.gz` / `uniprot_trembl.fasta.gz`) can be deleted afterwards
48+
- Extract 3d k-mer from the PDBs
49+
- TODO: process the k-mers
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
A
2+
B
3+
C
4+
D
5+
E
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
sdafasefewkj
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#include <iostream>
2+
#include <unordered_map>
3+
#include <vector>
4+
#include <algorithm>
5+
6+
// using namespace std;
7+
8+
void printHelp() {
9+
std::cout << "Usage: ./program [-k <kmer size>] [-f] [-s] [--help]\n"
10+
<< "Options:\n"
11+
<< " -k <kmer size> Set the size of the kmers (default=12)\n"
12+
<< " -f Output frequencies only (default=false)\n"
13+
<< " -s Output statistics and missing kmers (default=false)\n"
14+
<< " --help, -h Show this help message\n";
15+
}
16+
17+
// Generate synthetic set using recursion
18+
// void generateSyntheticSet(const string& kmer, int k, const unordered_map<string, int>& naturalSet, const function<void(const string&)>& callback) {
19+
// static string aminoAcids = "ACDEFGHIKLMNPQRSTVWY";
20+
21+
// if (k == 0) {
22+
// if (naturalSet.find(kmer) == naturalSet.end()) {
23+
// callback(kmer);
24+
// }
25+
// return;
26+
// }
27+
28+
// for (char c : aminoAcids) {
29+
// generateSyntheticSet(kmer + c, k - 1, naturalSet, callback);
30+
// }
31+
// }
32+
33+
// Main function
34+
int main(int argc, char* argv[]) {
35+
std::ios_base::sync_with_stdio(false);
36+
std::cin.tie(NULL);
37+
38+
// Default values
39+
int kmerSize = 12;
40+
bool outputFrequenciesOnly = false;
41+
bool outputStatsAndMissingKmers = false;
42+
43+
44+
// Check command-line arguments
45+
for (int i = 1; i < argc; i++) {
46+
std::string arg = argv[i];
47+
if (arg == "-k") {
48+
if (i + 1 < argc) { // Make sure we aren't at the end of argv!
49+
kmerSize = std::stoi(argv[++i]); // Increment 'i' so we don't get the arguments confused.
50+
}
51+
else { // Uh-oh, there was no argument to the kmer option.
52+
std::cerr << "-k option requires one argument." << std::endl;
53+
return 1;
54+
}
55+
}
56+
else if (arg == "-f") {
57+
outputFrequenciesOnly = true;
58+
}
59+
else if (arg == "-s") {
60+
outputStatsAndMissingKmers = true;
61+
}
62+
else if (arg == "--help" || arg == "-h") {
63+
printHelp();
64+
return 0;
65+
}
66+
}
67+
68+
std::unordered_map<std::string, int> kmerCounts;
69+
70+
std::string inputLine;
71+
while(getline(std::cin, inputLine) && !inputLine.empty()) {
72+
if(inputLine.size() >= kmerSize) {
73+
std::string kmer = inputLine.substr(0,kmerSize);
74+
kmerCounts[kmer]++;
75+
}
76+
}
77+
78+
if (kmerCounts.empty()) {
79+
printHelp();
80+
return 1;
81+
}
82+
83+
// Transfer the unordered_map to vector of pairs for sorting.
84+
std::vector<std::pair<std::string, int>> sortedKmers(kmerCounts.begin(), kmerCounts.end());
85+
86+
std::sort(sortedKmers.begin(), sortedKmers.end(),
87+
[](const std::pair<std::string, int> &a, const std::pair<std::string, int> &b) {
88+
return a.second > b.second;
89+
});
90+
91+
for(const auto &kmer : sortedKmers) {
92+
if(outputFrequenciesOnly) {
93+
std::cout << kmer.second << std::endl;
94+
} else {
95+
std::cout << kmer.second << " " << kmer.first << std::endl;
96+
}
97+
}
98+
99+
// Output statistics and missing kmers
100+
// if (outputStatsAndMissingKmers) {
101+
// cout << "Coverage: " << static_cast<double>(kmerCounts.size()) / pow(20, kmerSize) * 100 << "% (" << kmerCounts.size() << "/" << static_cast<int>(pow(20, kmerSize)) << ")\n";
102+
// generateSyntheticSet("", kmerSize, kmerCounts, [](const string& missingKmer) {
103+
// cout << missingKmer << "\n";
104+
// });
105+
// }
106+
107+
return 0;
108+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#include "AtomDataParser.h"
2+
#include <string>
3+
#include <sstream>
4+
// #include <iostream>
5+
6+
void parseAtomData(const std::string& str, AtomData& data, size_t offset)
7+
{
8+
std::string atom_name = str.substr(-offset + 12, 4);
9+
std::stringstream atom_ss(atom_name);
10+
atom_ss >> atom_name;
11+
12+
data.isValidAtom = atom_name == "CA"; // whether the atom is ca
13+
14+
data.resName = str.substr(-offset + 17, 3);
15+
data.resSeq = std::stoi(str.substr(-offset + 22, 4));
16+
data.x = std::stof(str.substr(-offset + 30, 8));
17+
data.y = std::stof(str.substr(-offset + 38, 8));
18+
data.z = std::stof(str.substr(-offset + 46, 8));
19+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef ATOMDATAPARSER_H
2+
#define ATOMDATAPARSER_H
3+
4+
#include <string>
5+
6+
struct AtomData
7+
{
8+
bool isValidAtom; // whether the atom is valid (CA)
9+
std::string resName; // residue name (AA)
10+
int resSeq; // residue sequence number
11+
float x, y, z;
12+
};
13+
14+
void parseAtomData(const std::string& str, AtomData& data, size_t offset = 0);
15+
16+
#endif // ATOMDATAPARSER_H
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#include "Constants.h"
2+
3+
#define X(code, name) name,
4+
const char *code_name[] = {
5+
PDB_PARSING_CODES
6+
};
7+
#undef X
8+
9+
const float MAX_RESOLUTION = 2.5f;
10+
const std::unordered_map<std::string, char> aminoAcidLookup = {
11+
{"ALA", 'A'}, {"ARG", 'R'}, {"ASN", 'N'}, {"ASP", 'D'},
12+
{"CYS", 'C'}, {"GLN", 'Q'}, {"GLU", 'E'}, {"GLY", 'G'},
13+
{"HIS", 'H'}, {"HIP", 'H'}, {"HIE", 'H'}, {"ILE", 'I'},
14+
{"LEU", 'L'}, {"LYS", 'K'}, {"MET", 'M'}, {"PHE", 'F'},
15+
{"PRO", 'P'}, {"SER", 'S'}, {"THR", 'T'}, {"TYR", 'Y'},
16+
{"TRP", 'W'}, {"VAL", 'V'}, {"SEC", 'U'}, {"PYL", 'O'},
17+
{"XPL", 'O'}, // for pdb 1L2Q
18+
{"GLX", 'Z'}, // for pdb 1KP0
19+
{"ASX", 'B'} // for pdb 1KP0
20+
// 3e2o, 2fmd, 2atc, 4cpa
21+
22+
};
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#ifndef CONSTANTS_H
2+
#define CONSTANTS_H
3+
4+
#include <unordered_map>
5+
#include <string>
6+
7+
#define PDB_PARSING_CODES \
8+
X(SUCCESS, "SUCCESS") \
9+
X(RESOLUTION_TOO_LOW, "RESOLUTION_TOO_LOW") \
10+
X(RESOLUTION_NOT_SPECIFIED, "RESOLUTION_NOT_SPECIFIED") \
11+
X(MISSING_NON_TERMINAL_RESIDUES, "MISSING_NON_TERMINAL_RESIDUES") \
12+
X(NO_ALPHA_CARBON_ATOMS_FOUND, "NO_ALPHA_CARBON_ATOMS_FOUND") \
13+
X(IS_NOT_PROTEIN, "IS_NOT_PROTEIN") \
14+
X(EXCLUDE_RARE_AMINO_ACIDS, "EXCLUDE_RARE_AMINO_ACIDS") \
15+
X(HAS_UNKNOWN_RESIDUE, "HAS_UNKNOWN_RESIDUE") \
16+
X(INVALID_SEQUENCE, "INVALID_SEQUENCE") \
17+
X(NO_UNIPROT_ID, "NO_UNIPROT_ID") \
18+
19+
// rare amino acids = SELENOCYSTEINE, PYRROLYSINE, others
20+
21+
#define X(code, name) code,
22+
enum PDBParsingCode : size_t {
23+
PDB_PARSING_CODES
24+
MAX_PDB_PARSING_CODES
25+
};
26+
#undef X
27+
28+
extern const char *code_name[MAX_PDB_PARSING_CODES];
29+
30+
enum ResidueConfirmation {
31+
RESIDUE_VALID,
32+
RESIDUE_DUPLICATE, // same residue multiple times (e.g. multiple confirmation)
33+
RESIDUE_OUT_OF_SEQUENCE // missing non-terminal residue
34+
};
35+
36+
enum PDBType {PROTEIN, DNA, RNA, MISC};
37+
38+
extern const float MAX_RESOLUTION;
39+
extern const std::unordered_map<std::string, char> aminoAcidLookup;
40+
41+
#endif // CONSTANTS_H
42+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#include "Utils.h"
2+
#include <sstream>
3+
#include <iterator>
4+
#include <unordered_set>
5+
#include <iostream>
6+
7+
#include "Constants.h"
8+
9+
std::string concatenateString(const std::vector<std::string>& strings) {
10+
const char delim = ',';
11+
std::ostringstream oss;
12+
13+
if (!strings.empty()) {
14+
// Convert all but the last element to avoid a trailing delimiter
15+
std::copy(strings.begin(), strings.end()-1,
16+
std::ostream_iterator<std::string>(oss, &delim));
17+
18+
// Now add the last element with no delimiter
19+
oss << strings.back();
20+
}
21+
22+
return oss.str();
23+
}
24+
25+
std::string concatenateString(const std::unordered_set<std::string>& strings) {
26+
const char delim = ',';
27+
std::ostringstream oss;
28+
29+
for (auto itr = strings.begin(); itr != strings.end(); ++itr) {
30+
if (itr != strings.begin()) {
31+
oss << delim;
32+
}
33+
oss << *itr;
34+
}
35+
36+
return oss.str();
37+
}
38+
39+
// Extracts the resolution from remark 2
40+
float extractResolution(const std::string &line) {
41+
std::string res_section = line.substr(23, 7);
42+
// skip empty remark line
43+
if (res_section == " ") {
44+
return -1;
45+
}
46+
try {
47+
return std::stof(line.substr(23, 7));
48+
} catch(std::invalid_argument) {
49+
return -2; // RESOLUTION. NOT APPLICABLE. (e.g. pdb 134D)
50+
}
51+
52+
return std::stof(line.substr(23, 7));
53+
}
54+
55+
/////////////////////
56+
/// PROCESS ENTRY ///
57+
/////////////////////
58+
59+
PDBType processHeader(const std::string &line) {
60+
std::string cls = line.substr(10, 40); // 11-50
61+
std::string pdbId = line.substr(62, 4); // 63-66
62+
63+
std::cout << "pdb_id: " << pdbId << std::endl;
64+
65+
if (cls.find("DNA") != std::string::npos) {
66+
if (cls.find("DNA BINDING PROTEIN") == std::string::npos)
67+
return DNA;
68+
}
69+
if (cls.find("RNA") != std::string::npos) {
70+
if (cls.find("RNA BINDING PROTEIN") == std::string::npos)
71+
return RNA;
72+
}
73+
74+
return PROTEIN;
75+
}
76+
77+
// Remark row
78+
void processRemark(const std::string &line, float &resolution) {
79+
int remark_no = std::stoi(line.substr(7, 3));
80+
switch (remark_no) {
81+
case 2:
82+
int extractedRes = extractResolution(line);
83+
if (extractedRes != -1) {
84+
resolution = extractResolution(line);
85+
}
86+
break;
87+
}
88+
}
89+
90+
// DBRef row
91+
void processDBRef(const std::string &line, std::unordered_set<std::string> &uniprotIds) {
92+
std::string db = line.substr(26, 6); // 27 - 32
93+
if (db != "UNP ") // only match uniprot
94+
return;
95+
96+
std::string uniprotId = line.substr(33, 8); // 34 - 41
97+
std::stringstream parser(uniprotId);
98+
parser >> uniprotId;
99+
uniprotIds.insert(uniprotId);
100+
}
101+
102+
void processDBRef1(const std::string &line, std::unordered_set<std::string> &uniprotIds) {
103+
// process 1 for uniprot
104+
std::string db = line.substr(26, 6); // 27 - 32
105+
if (db != "UNP ") // only match uniprot
106+
return;
107+
108+
// process 2 for id
109+
std::string nextLine;
110+
getline(std::cin, nextLine);
111+
112+
std::string uniprotId = nextLine.substr(18, 22); // 19 - 40
113+
std::stringstream parser(uniprotId);
114+
parser >> uniprotId;
115+
uniprotIds.insert(uniprotId);
116+
}
117+
118+
// SEQRES row
119+
void processSequence(const std::string &line, std::unordered_map<char, std::stringstream> &sequenceStreams) {
120+
char chainId = line[11];
121+
std::string aa;
122+
std::string aaLine = line.substr(19, 51);
123+
std::stringstream aaStream(aaLine);
124+
while (aaStream >> aa) {
125+
try {
126+
sequenceStreams[chainId] << aminoAcidLookup.at(aa);
127+
} catch (std::out_of_range) {
128+
// replace non-standard AA with dot (.)
129+
sequenceStreams[chainId] << '.';
130+
131+
// sequenceStreams.erase(chainId);
132+
// throw std::out_of_range("test");
133+
return;
134+
}
135+
}
136+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#ifndef UTILS_H
2+
#define UTILS_H
3+
4+
#include <vector>
5+
#include <string>
6+
#include <unordered_set>
7+
8+
#include "Constants.h"
9+
10+
std::string concatenateString(const std::vector<std::string>& strings);
11+
std::string concatenateString(const std::unordered_set<std::string>& strings);
12+
13+
PDBType processHeader(const std::string &line);
14+
float extractResolution(const std::string &line);
15+
void processRemark(const std::string &line, float &resolution);
16+
void processDBRef(const std::string &line, std::unordered_set<std::string> &uniprotIds);
17+
void processDBRef1(const std::string &line, std::unordered_set<std::string> &uniprotIds);
18+
void processSequence(const std::string &line, std::unordered_map<char, std::stringstream> &sequenceStreams);
19+
20+
#endif // UTILS_H
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
#include <iostream>
2+
#include <unordered_map>
3+
#include <string>
4+
#include <vector>
5+
#include <unordered_set>
6+
#include <sstream>
7+
8+
#include "AtomDataParser.h"
9+
#include "Constants.h"
10+
#include "Utils.h"
11+
12+
13+
// bool invalidSequence = false;
14+
bool invalidAA = false;
15+
bool hasUnknownResidues = false;
16+
std::stringstream parsedSequence;
17+
std::vector<std::string> errorOutput;
18+
19+
int firstCAResidue = 0;
20+
21+
ResidueConfirmation validateAtomSequence(int &prevCAResiduePosition, const int &resSeq) {
22+
if (prevCAResiduePosition + 1 != resSeq) {
23+
if (prevCAResiduePosition == -1) { // initial value
24+
prevCAResiduePosition = resSeq;
25+
firstCAResidue = resSeq;
26+
return RESIDUE_VALID;
27+
}
28+
29+
if (prevCAResiduePosition == resSeq)
30+
return RESIDUE_DUPLICATE;
31+
32+
std::stringstream errorString;
33+
errorString << "missing residues; prev=" << prevCAResiduePosition << ", next=" << resSeq;
34+
errorOutput.push_back(errorString.str());
35+
36+
prevCAResiduePosition = resSeq;
37+
return RESIDUE_OUT_OF_SEQUENCE;
38+
}
39+
prevCAResiduePosition = resSeq;
40+
return RESIDUE_VALID;
41+
}
42+
43+
//////////////////////////
44+
//////////////////////////
45+
///// PROCESS ROWS ///////
46+
//////////////////////////
47+
//////////////////////////
48+
49+
void processAtom(std::string &line, std::vector<std::string> &output, int &prevCAResiduePosition, bool &isSequenceValid) {
50+
AtomData data;
51+
parseAtomData(line, data, 0);
52+
if (!data.isValidAtom) {
53+
return;
54+
}
55+
56+
switch(validateAtomSequence(prevCAResiduePosition, data.resSeq)) {
57+
case RESIDUE_VALID:
58+
break; // continue
59+
case RESIDUE_DUPLICATE:
60+
return; // skip to next
61+
case RESIDUE_OUT_OF_SEQUENCE:
62+
isSequenceValid = false;
63+
break;
64+
}
65+
66+
char aminoAcid;
67+
try {
68+
aminoAcid = aminoAcidLookup.at(data.resName);
69+
} catch (std::out_of_range) { // should never throw if pdb is valid & not unknown
70+
if (data.resName == "UNK") {
71+
hasUnknownResidues = true;
72+
return;
73+
}
74+
75+
throw std::runtime_error("Unexpected atom type: " + data.resName);
76+
}
77+
78+
// Selenocysteine, Pyrrolysine, GLX, ASX, too rare, skip
79+
if (aminoAcid == 'U' || aminoAcid == 'O' || aminoAcid == 'Z' || aminoAcid == 'B') {
80+
invalidAA = true;
81+
}
82+
83+
// construct output string
84+
std::stringstream ss;
85+
ss << aminoAcid << ' ' << data.x << ' ' << data.y << ' ' << data.z;
86+
// std::cout << aminoAcid << ' ' << data.x << ' ' << data.y << ' ' << data.z << std::endl;
87+
88+
// construct sequence string
89+
parsedSequence << aminoAcid;
90+
91+
output.push_back(ss.str());
92+
}
93+
94+
// Checks whether the parsed input, so far, produced a valid, sequential
95+
// list of residues with coordinates.
96+
// Returns PDBParsingCode.SUCCESS if successful, and a specific error code
97+
// otherwise.
98+
PDBParsingCode isPDBInvalid(float &resolution, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent) {
99+
bool isResolutionValid = resolution < 2.5;
100+
if (!isResolutionValid) { // resolution too low
101+
return RESOLUTION_TOO_LOW;
102+
}
103+
104+
if (resolution == -1) // no valid resolution remark returned
105+
return RESOLUTION_NOT_SPECIFIED;
106+
107+
if (!isSequenceValid) // missing non-terminal residues
108+
return MISSING_NON_TERMINAL_RESIDUES;
109+
110+
if (prevCAResiduePosition == -1) { // no single CA atom found
111+
if (anyCAAtomsPresent) // if any model had, but last one didn't
112+
return MISSING_NON_TERMINAL_RESIDUES;
113+
return NO_ALPHA_CARBON_ATOMS_FOUND;
114+
}
115+
116+
return SUCCESS;
117+
}
118+
119+
PDBParsingCode isPDBInvalid(float &resolution, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent, bool &isNotProtein) {
120+
if (isNotProtein)
121+
return IS_NOT_PROTEIN;
122+
// if (invalidSequence)
123+
// return INVALID_SEQUENCE;
124+
if (hasUnknownResidues)
125+
return HAS_UNKNOWN_RESIDUE;
126+
if (invalidAA)
127+
return EXCLUDE_RARE_AMINO_ACIDS;
128+
return isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent);
129+
}
130+
131+
void resetPDBOutput(std::vector<std::string> &output, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent) {
132+
if (!anyCAAtomsPresent && output.size()) {
133+
anyCAAtomsPresent = true;
134+
}
135+
136+
output.clear();
137+
isSequenceValid = true;
138+
hasUnknownResidues = false;
139+
invalidAA = false;
140+
prevCAResiduePosition = -1;
141+
firstCAResidue = 0;
142+
parsedSequence.str("");
143+
}
144+
145+
bool hasMatched = true;
146+
147+
std::vector<std::string> processSequences(std::unordered_map<char, std::stringstream> &sequenceStreams) {
148+
std::unordered_set<std::string> uniqueSequences;
149+
std::vector<std::string> sequences;
150+
std::string matchedSequence = "N/A";
151+
152+
if (sequenceStreams.size() == 0)
153+
return sequences;
154+
155+
for (const auto & [_chainId, stream] : sequenceStreams) {
156+
auto sequence = stream.str();
157+
if (sequence.size() != 0) {
158+
auto sequencePosition = sequence.find(parsedSequence.str());
159+
if (parsedSequence.str().size() > 0 && sequencePosition != std::string::npos) {
160+
matchedSequence = sequence;
161+
} else {
162+
uniqueSequences.insert(sequence);
163+
}
164+
}
165+
}
166+
167+
// line 4: matched sequence (parsed contained within matched)
168+
if (matchedSequence != "")
169+
std::cout << "matched: " << matchedSequence << std::endl;
170+
// line 5: sequence parsed from ATOM records
171+
std::cout << "parsed: " << parsedSequence.str() << std::endl;
172+
173+
sequences.push_back(matchedSequence);
174+
175+
// line 6+: all other parsed sequences
176+
for (std::string seq: uniqueSequences) {
177+
std::cout << "other: " << seq << std::endl;
178+
sequences.push_back(seq);
179+
}
180+
181+
if (matchedSequence == "N/A")
182+
hasMatched = false;
183+
184+
return sequences;
185+
}
186+
187+
// Small script for parsing PDB files.
188+
// Takes in a stream of a PDB file as input.
189+
int main() {
190+
std::ios_base::sync_with_stdio(false);
191+
std::cin.tie(NULL);
192+
193+
std::vector<std::string> output;
194+
std::unordered_set<std::string> uniprotIds;
195+
bool isSequenceValid = true;
196+
bool processed_atom = false;
197+
bool anyCAAtomsPresent = false;
198+
bool isNotProtein = false;
199+
auto resolution = -1.0f;
200+
int prevCAResiduePosition = -1;
201+
202+
std::unordered_map<char, std::stringstream> sequenceStreams;
203+
204+
std::string line;
205+
while (getline(std::cin, line)) {
206+
std::string param = line.substr(0, 6);
207+
208+
if (param == "HEADER") {
209+
auto headerType = processHeader(line);
210+
if (headerType != PROTEIN) {
211+
isNotProtein = true;
212+
// break;
213+
}
214+
} else if (param == "REMARK") {
215+
processRemark(line, resolution);
216+
} else if (param == "DBREF ") {
217+
processDBRef(line, uniprotIds);
218+
} else if (param == "DBREF1") {
219+
processDBRef1(line, uniprotIds);
220+
} else if (param == "SEQRES") {
221+
processSequence(line, sequenceStreams);
222+
} else if (param == "ATOM ") { // HETATM residues are skipped
223+
processAtom(line, output, prevCAResiduePosition, isSequenceValid);
224+
} else if (param == "TER ") { // end of one chain
225+
auto pdbValidity = isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent);
226+
// std::cout << code_name[pdbValidity] << std::endl;
227+
if (pdbValidity == SUCCESS)
228+
break; // terminate parser, output PDB
229+
230+
// if at first you don't succeed, try, try again (parse next model)
231+
resetPDBOutput(output, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent);
232+
} // else ignore line, until end is reached
233+
}
234+
235+
// line 1 -- pdb id
236+
// "pdb_id: 201L" (printed in Utils.cpp)
237+
238+
// line 2 -- resolution
239+
std::cout << "resolut: " << resolution << std::endl;
240+
241+
// line 3 -- uniprot IDs
242+
std::string allUniprotIds = concatenateString(uniprotIds);
243+
std::cout << "uniprot: " << allUniprotIds << std::endl;
244+
245+
// line 4 -- matched sequence (atom record substring of reqres)
246+
// line 5 -- parsed sequence (atom records)
247+
// line 6-n -- other sequences (reqres sequence)
248+
std::vector<std::string> sequences = processSequences(sequenceStreams);
249+
250+
251+
auto pdbValidity = isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent, isNotProtein);
252+
if (pdbValidity != SUCCESS) {
253+
std::cerr << code_name[pdbValidity] << std::endl;
254+
255+
for (auto error : errorOutput)
256+
std::cout << error << std::endl;
257+
return pdbValidity;
258+
}
259+
260+
if (uniprotIds.size() == 0) {
261+
std::cerr << code_name[NO_UNIPROT_ID] << std::endl;
262+
return NO_UNIPROT_ID;
263+
}
264+
265+
// if (!hasMatched) {
266+
// std::cerr << "NO_MATCH" << std::endl;
267+
// return 103;
268+
// // std::cerr << "NO MATCHED SEQUENCE" << std::endl;
269+
// }
270+
271+
// line n+1: sequence number of initial residue (starts with 1)
272+
std::cout << "initres: " << firstCAResidue << std::endl;
273+
274+
// line n+2: empty line
275+
std::cout << std::endl;
276+
277+
// lines n+3 to end: coordinates in format <residue> <x> <y> <z>
278+
for (std::string pos : output) {
279+
std::cout << pos << std::endl;
280+
}
281+
282+
return 0;
283+
}
284+

‎cpp_scripts/extract_pdb_coordinates/inputf.txt

+2,261
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
pdb_id: 2SPC
2+
resolut: 1.8
3+
uniprot: P13395
4+
matched: QNLDLQLYMRDCELAESWMSAREAFLNADDDANAGGNVEALIKKHEDFDKAINGHEQKIAALQTVADQLIAQNHYASNLVDEKRKQVLERWRHLKEGLIEKRSRLGD
5+
parsed: QNLDLQLYMRDCELAESWMSAREAFLNADDDANAGGNVEALIKKHEDFDKAINGHEQKIAALQTVADQLIAQNHYASNLVDEKRKQVLERWRHLKEGLIEKRSRLGD
6+
initres: 0
7+
8+
Q 9.907 11.72 54.535
9+
N 12.788 14.14 53.949
10+
L 11.802 16.244 50.883
11+
D 15.003 15.15 49.161
12+
L 14.068 11.571 49.452
13+
Q 10.65 12.624 48.133
14+
L 12.115 14.246 45.02
15+
Y 14.073 11.064 44.305
16+
M 11.012 8.87 44.708
17+
R 9.095 11.152 42.291
18+
D 11.987 11.014 39.843
19+
C 12.022 7.185 40.088
20+
E 8.282 7.058 39.496
21+
L 8.617 9.386 36.438
22+
A 11.462 7.139 35.063
23+
E 9.297 4.066 35.531
24+
S 6.386 5.701 33.763
25+
W 8.623 6.706 30.78
26+
M 9.802 3.061 30.69
27+
S 6.172 1.93 30.638
28+
A 5.384 4.178 27.708
29+
R 8.346 2.567 25.825
30+
E 7.01 -0.902 26.678
31+
A 3.663 0.075 25.043
32+
F 5.556 1.418 21.96
33+
L 7.357 -1.841 21.506
34+
N 4.03 -3.527 21.436
35+
A 2.384 -1.339 18.87
36+
D 5.547 -1.733 16.805
37+
D 5.224 -5.496 16.92
38+
D 1.71 -5.1 15.512
39+
A 3.188 -3.413 12.374
40+
N 6.848 -2.398 12.824
41+
A 9.052 -0.791 10.222
42+
G 9.794 -2.047 6.675
43+
G 12.498 -4.177 4.969
44+
N 13.967 -1.043 3.216
45+
V 17.68 -0.537 3.979
46+
E 17.636 3.194 4.641
47+
A 14.899 2.888 7.229
48+
L 15.787 -0.448 8.811
49+
I 18.88 1.405 9.657
50+
K 17.038 4.571 10.711
51+
K 14.888 2.479 13.062
52+
H 17.982 0.769 14.366
53+
E 19.512 4.11 15.342
54+
D 16.265 5.007 17.029
55+
F 16.484 1.922 19.286
56+
D 20.138 2.691 20.086
57+
K 19.077 6.24 20.997
58+
A 16.243 4.876 23.217
59+
I 18.463 2.373 24.949
60+
N 21.093 4.995 25.504
61+
G 18.802 7.603 27.025
62+
H 17.291 4.974 29.279
63+
E 20.699 3.842 30.514
64+
Q 21.74 7.36 31.254
65+
K 18.626 8.228 33.163
66+
I 19.07 4.99 35.165
67+
A 22.673 5.942 36.042
68+
A 21.712 9.362 37.233
69+
L 19.069 7.939 39.585
70+
Q 21.583 5.512 40.954
71+
T 23.72 8.471 41.69
72+
V 21.053 10.121 43.722
73+
A 20.244 6.848 45.58
74+
D 23.921 6.549 46.53
75+
Q 23.872 10.049 47.957
76+
L 20.8 9.651 50.047
77+
I 22.001 6.333 51.437
78+
A 25.492 7.822 52.194
79+
Q 23.74 10.702 54.081
80+
N 21.676 8.217 56.08
81+
H 24.768 6.134 56.84
82+
Y 26.644 9.203 57.99
83+
A 23.826 10.239 60.337
84+
S 23.465 6.734 61.816
85+
N 27.196 6.474 62.335
86+
L 27.336 9.819 64.171
87+
V 24.369 8.807 66.346
88+
D 25.834 5.436 67.258
89+
E 29.307 6.694 67.839
90+
K 28.077 9.514 70.137
91+
R 26.142 6.804 72.015
92+
K 29.251 4.721 72.476
93+
Q 31.204 7.767 73.797
94+
V 28.409 8.608 76.23
95+
L 28.205 4.949 77.433
96+
E 31.978 4.872 77.948
97+
R 31.98 8.098 79.926
98+
W 29.249 6.529 82.085
99+
R 31.385 3.523 82.57
100+
H 34.024 5.735 84.157
101+
L 31.539 7.755 86.235
102+
K 29.984 4.543 87.496
103+
E 33.353 3.27 88.454
104+
G 33.954 6.165 90.738
105+
L 30.502 6.126 92.265
106+
I 30.703 2.45 93.297
107+
E 34.323 2.559 94.435
108+
K 33.83 5.738 96.532
109+
R 30.789 3.887 97.846
110+
S 33.238 1.146 98.961
111+
R 35.696 3.644 100.393
112+
L 33.011 4.864 102.758
113+
G 34.731 2.184 104.901
114+
D 35.966 -1.307 103.936
+169
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
>sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
2+
MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
3+
EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD
4+
AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL
5+
EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD
6+
SFRKIYTDLGWKFTPL
7+
>sp|Q6GZX3|002L_FRG3G Uncharacterized protein 002L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-002L PE=4 SV=1
8+
MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
9+
IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
10+
AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
11+
KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
12+
DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
13+
VMFFVAGAVLVAILISTVRW
14+
>sp|Q197F8|002R_IIV3 Uncharacterized protein 002R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-002R PE=4 SV=1
15+
MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL
16+
QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT
17+
FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD
18+
LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET
19+
YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY
20+
STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS
21+
GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI
22+
QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC
23+
>sp|Q197F7|003L_IIV3 Uncharacterized protein 003L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-003L PE=4 SV=1
24+
MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT
25+
PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS
26+
TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI
27+
>sp|Q6GZX2|003R_FRG3G Uncharacterized protein 3R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-003R PE=3 SV=1
28+
MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD
29+
RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI
30+
FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ
31+
PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD
32+
AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR
33+
TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA
34+
LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR
35+
KAKIQEMFDNMVSRMVTS
36+
>sp|Q6GZX1|004R_FRG3G Uncharacterized protein 004R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-004R PE=4 SV=1
37+
MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY
38+
>sp|Q197F5|005L_IIV3 Uncharacterized protein 005L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-005L PE=3 SV=1
39+
MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTL
40+
CSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELC
41+
KADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYH
42+
QPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY
43+
>sp|Q6GZX0|005R_FRG3G Uncharacterized protein 005R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-005R PE=4 SV=1
44+
MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS
45+
NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED
46+
QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT
47+
REFVDKDAQEFQDFLNSLDASLLS
48+
>sp|Q91G88|006L_IIV6 Putative KilA-N domain-containing protein 006L OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-006L PE=3 SV=1
49+
MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL
50+
IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII
51+
INYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRK
52+
KDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVE
53+
DRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCH
54+
PNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV
55+
>sp|Q6GZW9|006R_FRG3G Uncharacterized protein 006R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-006R PE=4 SV=1
56+
MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIE
57+
YLGPWVQAEYRRIKG
58+
>sp|Q6GZW8|007R_FRG3G Uncharacterized protein 007R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-007R PE=4 SV=1
59+
MRSIKPLRCCNAHGRHVSQEYGRCTLLLFREKLFLQTGLVCNKQCNAPNNDGAESKHHGI
60+
HHGSRGALALRGAGVHLLASAALGPRVLAGLVPTGRSVQGSVGQCGRVAQIGRARDVAAR
61+
KQESYCEK
62+
>sp|Q197F3|007R_IIV3 Uncharacterized protein 007R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-007R PE=4 SV=1
63+
MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV
64+
YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL
65+
LVGNEWYCKTFGKAGSKNVFLYNMIPTIYRDEPQHQEQILKKFMFFNATKNVEQNPNFLD
66+
NVPEEYYHLLLPKSWVEKNLSDKYRKIMETEHKPLVFSCEPAFSFGLCRNTQDKNESYQL
67+
SLCLYEREKPRDAEIVWAAKYDELAAMVRDYLKKTPEFKKYRSFISCMKGLSWKNNEIGD
68+
KDGPKLYPKVIFNRKKGEFVTIFTKDDDVEPETIEDPRTILDRRCVVQAALRLESVFVHN
69+
KVAIQLRINDVLISEWKEASSKPQPLILRRHRFTKPSSSVAKSTSPSLRNSGSDESDLNQ
70+
SDSDKEDERVVPVPKTKRIVKTVKLPN
71+
>sp|Q197F2|008L_IIV3 Uncharacterized protein 008L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-008L PE=4 SV=1
72+
MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQNAGDVTNKAYVDQA
73+
VMSAAVPVASSTTVGTIQMAGDLEGSSGTNPIIAANKITLNKLQKIGPKMVIGNPNSDWN
74+
NTQEIELDSSFRIVDNRLNAGIVPISSTDPNKSNTVIPAPQQNGLFYLDSSGRVWVWAEH
75+
YYKCITPSRYISKWMGVGDFQELTVGQSVMWDSGRPSIETVSTQGLEVEWISSTNFTLSS
76+
LYLIPIVVKVTICIPLLGQPDQMAKFVLYSVSSAQQPRTGIVLTTDSSRSSAPIVSEYIT
77+
VNWFEPKSYSVQLKEVNSDSGTTVTICSDKWLANPFLDCWITIEEVG
78+
>sp|Q6GZW6|009L_FRG3G Putative helicase 009L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-009L PE=4 SV=1
79+
MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRRAVQDDPAFGHQKLV
80+
ETFLSEDTPYRELLLFHAPGTGKTCTVVSVAERAKEKGLTRGCIVLARGAALLRNFLHEL
81+
VFNCGTGGRYIPEGYADMGDQERTRKMRKAVSSYYQFRTYETFAKSVATMSAEAIRARYD
82+
RFVIVMDEVHHLRSVQAEGVNTYSAISRFLRTVRGCVKMLLTGTPMTNEPGELADVLNLI
83+
LPQDKTIRPEDGIFSNSGDLLKPDELAERVRGRVSYLKAARPDAGLTFAGEVLGGTGMTH
84+
LRLVRLEMSAFQSDAYASAWDQDAGDRNIFSNSRQCSLAVMPDRRWGSAAEARNPSQVRR
85+
MAGQNLAEYSVKYDYLVRVASSSPKTFAYCEYVNGSGLSLLSDILLANGWRRATGRETTP
86+
GKRFALLTASQKNIHKIVQRFNHEDNVDGAYISLLLGSRVVAEGLTFKEVRHTVILTPHW
87+
NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPRSIDSDMYAVSEVKD
88+
KRIKAVERILMTSAADCSLLRSRNLYPSEFDGSRECEYGRCAYRCSNVSVEPGPLPALLG
89+
ASAAEAVAQVRLDGGGDPAIMKVDMSTLWAEVTAGRRYVNRWGDGAVLRAEGGRLELSAP
90+
YGSSEEGRWGDFYKTRNLCYAKMDQDHLRADDLRDSLPQEVEELLTVSPVETIGETASAM
91+
PQEVATAILMACVQARADGKTLNVVRRDALLDFYKGFYAMGPSGWTVWLHARGANAKVYD
92+
GRRWNPADEDTLEFLAARSAKFTDTRIGYYGLYNPNLKDFCIRDVTQGKRDKVDLRKLTV
93+
GRRCVDWDQRTLVHIVARLMKIDGRRDFMPHATLREMRELAEQDPLHEPSDLTSKEACRR
94+
FLFWTQKGDNKFRRQDICKAMEKWFIENDLMEDNFDCGHQHKRRGKFA
95+
>sp|Q91G85|009R_IIV6 Uncharacterized protein 009R OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-009R PE=3 SV=1
96+
MIKLFCVLAAFISINSACQSSHQQREEFTVATYHSSSICTTYCYSNCVVASQHKGLNVES
97+
YTCDKPDPYGRETVCKCTLIKCHDI
98+
>sp|Q6GZW5|010R_FRG3G Uncharacterized protein 010R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-010R PE=4 SV=1
99+
MKMDTDCRHWIVLASVPVLTVLAFKGEGALALAGLLVMAAVAMYRDRTEKKYSAARAPSP
100+
IAGHKTAYVTDPSAFAAGTVPVYPAPSNMGSDRFEGWVGGVLTGVGSSHLDHRKFAERQL
101+
VDRREKMVGYGWTKSFF
102+
>sp|Q197E9|011L_IIV3 Uncharacterized protein 011L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-011L PE=4 SV=1
103+
MMESPKYKKSTCSVTNLGGTCILPQKGATAPKAKDVSPELLVNKMDNLCQDWARTRNEYN
104+
KVHIEQAPTDSYFGVVHSHTPKKKYTSRDSDSEPEATSTRRSATAQRAANLKSSPVDQWS
105+
TTPPQPQPQPAAPTVKKTCASSPPAALSVKRTCTSPPPPPVLIDDDTGEDAFYDTNDPDI
106+
FYDIENGVSELETEGPKRPVYYQRNIRYPIDGSVPQESEQWYDPIDDEFLASSGDVVSLE
107+
PSPIAAFQPTPPKTVQFVPMPEEIIVPPPPPPKTVVDEGVQAMPYTVDQMIQTDFEESPL
108+
LANVNLRTIPIEEVNPNFSPVLMQDMVRDSFVFGTVAQRVMASQRVKQFFKELIEQDVSL
109+
AGRMCMDSGSPQLNLYNSLMGVKLLYRWRSSTTFYRAIVPEIDEPVQVMQDVLSSSEWAK
110+
FDSQAGIPPKMVYIHYKLLNDLVKTLICPNFQLTHAALVCVDCRPEAVGSDGLQDGRQRR
111+
CSNLVSEYHEMTLEDLFNTIKPADLNAKNIILSVLFQMLYAVATVQKQFGMGGLFANADS
112+
VHVRRIQPGGFWHYTVNGLRYSVPNYGYLVILTNFTDVVNYRPDFATTRYFGRRQAKVVP
113+
TRNWYKFVPFTTRYRPFVTVDPITQAKTTAYAPNPPTEGITINEFYKDSSDLRPSVPVDL
114+
NDMITFPVPEFHLTICRLFSFFSKFYDSNFIGNDPFVRNLVDRYSQPFEFPDVYWPEDGV
115+
SRVLACYTIEEIYPNWVDGDTDYVIESYNLD
116+
>sp|Q6GZW4|011R_FRG3G Uncharacterized protein 011R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-011R PE=4 SV=1
117+
MTSVKTIAMLAMLVIVAALIYMGYRTFTSMQSKLNELESRVNAPQLRPPVMSPIVPLNFI
118+
ESEDLDKELD
119+
>sp|Q6GZW3|012L_FRG3G Uncharacterized protein 012L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-012L PE=4 SV=1
120+
MCAKLVEMAFGPVNADSPPLTAEEKESAVEKLVGSKPFPALKKKYHDKVPAQDPKYCLFS
121+
FVEVLPSCDIKAAGAEEMCSCCIKRRRGQVFGVACVRGTAHTLAKAKQKADKLVGDYDSV
122+
HVVQTCHVGRPFPLVSSGMAQETVAPSAMEAAEAAMDAKSAEKRKERMRQKLEMRKREQE
123+
IKARNRKLLEDPSCDPDAEEETDLERYATLRVKTTCLLENAKNASAQIKEYLASMRKSAE
124+
AVVAMEAADPTLVENYPGLIRDSRAKMGVSKQDTEAFLKMSSFDCLTAASELETMGF
125+
>sp|Q197E7|013L_IIV3 Uncharacterized protein IIV3-013L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-013L PE=4 SV=1
126+
MYYRDQYGNVKYAPEGMGPHHAASSSHHSAQHHHMTKENFSMDDVHSWFEKYKMWFLYAL
127+
ILALIFGVFMWWSKYNHDKKRSLNTASIFY
128+
>sp|Q6GZW2|013R_FRG3G Uncharacterized protein 013R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-013R PE=4 SV=1
129+
MANSVAFSSMTWYSPLASDNLYDICVDKVHNRVLCLCHSFGCCTNAVVIWILPSFDEFTP
130+
QTLSCKGP
131+
>sp|Q6GZW1|014R_FRG3G Uncharacterized protein 014R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-014R PE=4 SV=1
132+
METLVQAYLDIQGKIAEFRREIKALRVEEKAITANLFEAMGEAGVESIRISEDRYLVAEE
133+
KPKRTRSKQQFYQAAEGEGFTQEDVDRLMSLSRGAVTGSSSNVKIRKSAPARNEEDDDG
134+
>sp|Q6GZW0|015R_FRG3G Uncharacterized protein 015R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-015R PE=4 SV=1
135+
MEQVPIKEMRLSDLRPNNKSIDTDLGGTKLVVIGKPGSGKSTLIKALLDSKRHIIPCAVV
136+
ISGSEEANGFYKGVVPDLFIYHQFSPSIIDRIHRRQVKAKAEMGSKKSWLLVVIDDCMDN
137+
AKMFNDKEVRALFKNGRHWNVLVVIANQYVMDLTPDLRSSVDGVFLFRENNVTYRDKTYA
138+
NFASVVPKKLYPTVMETVCQNYRCMFIDNTKATDNWHDSVFWYKAPYSKSAVAPFGARSY
139+
WKYACSKTGEEMPAVFDNVKILGDLLLKELPEAGEALVTYGGKDGPSDNEDGPSDDEDGP
140+
SDDEEGLSKDGVSEYYQSDLDD
141+
>sp|Q6GZV8|017L_FRG3G Uncharacterized protein 017L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-017L PE=4 SV=1
142+
METMSDYSKEVSEALSALRGELSALSAAISNTVRAGSYSAPVAKDCKAGHCDSKAVLKSL
143+
SRSARDLDSAVEAVSSNCEWASSGYGKQIARALRDDAVRVKREVESTRDAVDVVTPSCCV
144+
QGLAEEAGKLSEMAAVYRCMATVFETADSHGVREMLAKVDGLKQTMSGFKRLLGKTAEID
145+
GLSDSVIRLGRSIGEVLPATEGKAMRDLVKQCERLNGLVVDGSRKVEEQCSKLRDMASQS
146+
YVVADLASQYDVLGGKAQEALSASDALEQAAAVALRAKAAADAVAKSLDSLDVKKLDRLL
147+
EQASAVSGLLAKKNDLDAVVTSLAGLEALVAKKDELYKICAAVNSVDKSKLELLNVKPDR
148+
LKSLTEQTVVVSQMTTALATFNEDKLDSVLGKYMQMHRFLGMATQLKLMSDSLAEFQPAK
149+
MAQMAAAASQLKDFLTDQTVSRLEKVSAAVDATDVTKYASAFSDGGMVSDMTKAYETVKA
150+
FAAVVNSLDSKKLKLVAECAKK
151+
>sp|Q6GZV7|018L_FRG3G Uncharacterized protein 018L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-018L PE=3 SV=1
152+
MQNSKTDMCAALWAVTGLVLNVAVRFALEPFKESMGQGWHTAARVAVNGAIVLALADRLS
153+
DSPVTMTLFVMALSASPE
154+
>sp|Q6GZV6|019R_FRG3G Putative serine/threonine-protein kinase 019R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-019R PE=3 SV=1
155+
MATNYCDEFERNPTRNPRTGRTIKRGGPVFRALERECSDGAARVFPAAAVRGAAAARAAS
156+
PRVAAASPCPEFARDPTRNPRTGRPIKRGGPVFRALERECADYGGASPRRVSPARAFPNR
157+
RVSPARRQSPAEAAEASPCPEFARDPTRNPRTGRTIKRGGPTYRALEAECADYGRLSPIR
158+
SPWSDWSSTGLSPFRSHMRKSPARRSPARRSPARRSLARYTEHLTSDSETEVDYDARNVI
159+
RSQVGPGGVCERFAADPTRNPVTGSPLSRNDPLYTDLMEICKGYPDTPLTKSLTGEGTDD
160+
DTCEAFCRDPTRNPVTGQKMRRNGIEYQMFAEECDCSGISRPSGVSRTSGTSGSSGSSAS
161+
SRPPNSFEAPGASSRPPNSFEASGAARVPGTPSVSRGEPRWMSSISTRHNYDESNPMSVA
162+
FRLRHVKDIRKFLRTVRPGRSGFCATDKGGWLGSAAVSDNVIGQGSWGSVHMVKFRDFPE
163+
EFVVKEAVLMSVSEKHRYKPTVVWDEWAAGSVPDEVVVNNMVTEIAATGMTPFVPLTAGA
164+
GACDSCNPQLLEKAAKVTKCYLQAMEAADFSLDRVLPTMSPDQAASALAQILLGLQSLQT
165+
TLGIMHNDIKAHNILVKRVPPGGYWKVTDSFNGQVFYIPNEGYLCMLADYGVVRLVKPAV
166+
GMDTLYGTRNARFVPRDVGRWGKGAGTEYVVTPIRSKISVVVRGGRFVGVEPNKAVRYWK
167+
NTDTSKVGDVITTNNVFYMGYDIEPDMQVQLDDTNSFPVWESRGDVADCVRTFVGGKRAS
168+
QPGFHRLFYKKTGSAWEKAAETVAKQNPLFSGFTLDGSGLKYIRAATACAYIFPGMAVPR
169+
PGEREIESFTM
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#include <iostream>
2+
#include <string>
3+
#include <vector>
4+
#include <sstream>
5+
#include <sqlite3.h>
6+
7+
#define CHUNK_SIZE 10000
8+
9+
struct Record {
10+
std::string id;
11+
std::string sequence;
12+
};
13+
14+
void chunked_insert(sqlite3* db, std::vector<Record>& records) {
15+
char* zErrMsg = 0;
16+
sqlite3_exec(db, "BEGIN TRANSACTION;", NULL, NULL, &zErrMsg);
17+
18+
for(auto& record : records){
19+
std::string sql = "INSERT INTO sequences (id, sequence) VALUES ('" + record.id + "', '" + record.sequence + "');";
20+
sqlite3_exec(db, sql.c_str(), NULL, NULL, &zErrMsg);
21+
}
22+
23+
sqlite3_exec(db, "COMMIT;", NULL, NULL, &zErrMsg);
24+
records.clear();
25+
}
26+
27+
int main() {
28+
std::ios_base::sync_with_stdio(false);
29+
std::cin.tie(NULL);
30+
sqlite3* db;
31+
sqlite3_open("uniprotkb/uniprot_sequences.db", &db);
32+
33+
char* zErrMsg = 0;
34+
sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS sequences (id TEXT PRIMARY KEY, sequence TEXT NOT NULL);", NULL, NULL, &zErrMsg);
35+
// sqlite3_exec(db, "CREATE INDEX IF NOT EXISTS idx_id ON sequences (id);", NULL, NULL, &zErrMsg);
36+
// sqlite3_exec(db, "CREATE INDEX IF NOT EXISTS idx_sequence ON sequences (sequence);", NULL, NULL, &zErrMsg);
37+
38+
std::vector<Record> records;
39+
std::string line, uniprot_id, sequence;
40+
while(std::getline(std::cin, line)) {
41+
if(line[0] == '>') {
42+
if(!uniprot_id.empty() && !sequence.empty()) {
43+
records.push_back({uniprot_id, sequence});
44+
sequence.clear();
45+
46+
if(records.size() >= CHUNK_SIZE){
47+
chunked_insert(db, records);
48+
}
49+
}
50+
51+
// Extracting Uniprot ID
52+
int first_pipe = line.find("|");
53+
int second_pipe = line.find("|", first_pipe + 1);
54+
uniprot_id = line.substr(first_pipe + 1, second_pipe - first_pipe - 1);
55+
} else {
56+
sequence += line;
57+
}
58+
}
59+
60+
if(!uniprot_id.empty() && !sequence.empty()){
61+
records.push_back({uniprot_id, sequence});
62+
}
63+
64+
if(!records.empty()){
65+
chunked_insert(db, records);
66+
}
67+
68+
sqlite3_close(db);
69+
return 0;
70+
}

‎cpp_scripts/post_process_kmers/inputf.txt

Whitespace-only changes.

‎cpp_scripts/post_process_kmers/outputf.txt

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#include <iostream>
2+
#include <fstream>
3+
#include <string>
4+
#include <sstream>
5+
#include <unordered_map>
6+
#include <filesystem>
7+
#include <vector>
8+
#include <algorithm>
9+
#include <map>
10+
#include <iomanip>
11+
12+
struct PdbInfo {
13+
std::string pdb_id;
14+
double resolution;
15+
int sequence_length;
16+
};
17+
18+
namespace fs = std::filesystem;
19+
20+
std::unordered_map<std::string, int> global_kmers;
21+
bool process_all_pdbs = false;
22+
int kmer_size = 12;
23+
24+
std::vector<std::string> readUniprotFiles(const fs::path& uniprot_path) {
25+
std::vector<std::string> file_list;
26+
for(const auto& entry : fs::directory_iterator(uniprot_path)) {
27+
file_list.push_back(entry.path().string());
28+
}
29+
return file_list;
30+
}
31+
32+
PdbInfo parseLine(const std::string& line) {
33+
std::istringstream iss(line);
34+
PdbInfo info;
35+
iss >> info.pdb_id >> info.resolution >> info.sequence_length;
36+
return info;
37+
}
38+
39+
PdbInfo selectPdb(const std::vector<PdbInfo>& pdb_infos) {
40+
std::vector<PdbInfo> sorted_infos = pdb_infos;
41+
std::sort(sorted_infos.begin(), sorted_infos.end(), [](const PdbInfo& a, const PdbInfo& b) {
42+
return a.resolution < b.resolution || (a.resolution == b.resolution && a.sequence_length > b.sequence_length);
43+
});
44+
45+
for (size_t i = 0; i < sorted_infos.size() - 1; ++i) {
46+
if (static_cast<double>(sorted_infos[i].sequence_length) >= 0.8 * sorted_infos[i + 1].sequence_length) {
47+
return sorted_infos[i];
48+
}
49+
}
50+
return sorted_infos.back();
51+
}
52+
53+
PdbInfo parseInfoFile(const std::string& file_path) {
54+
std::ifstream file(file_path);
55+
std::string line;
56+
std::vector<PdbInfo> pdb_infos;
57+
while(std::getline(file, line)) {
58+
pdb_infos.push_back(parseLine(line));
59+
}
60+
61+
return selectPdb(pdb_infos);
62+
}
63+
64+
void parseKmersFile(const std::string& file_path) {
65+
std::ifstream file(file_path);
66+
std::string line;
67+
while(std::getline(file, line)) {
68+
if(line.length() >= kmer_size) {
69+
global_kmers[line.substr(0, kmer_size)]++;
70+
}
71+
}
72+
}
73+
74+
int main(int argc, char** argv) {
75+
for(int i = 1; i < argc; i++) {
76+
std::string arg = argv[i];
77+
if(arg == "-a") {
78+
process_all_pdbs = true;
79+
} else if(arg == "-k" && i + 1 < argc) {
80+
kmer_size = std::stoi(argv[++i]);
81+
} else if(arg == "-h" || arg == "--help") {
82+
std::cout << "Usage: " << argv[0] << " [OPTIONS]\n"
83+
<< "Options:\n"
84+
<< " -a Process all PDBs\n"
85+
<< " -k <value> Specify the size of the k-mers\n"
86+
<< " -h, --help Display this help message and exit\n";
87+
return 0;
88+
}
89+
}
90+
91+
fs::path uniprot_path = "./pdb_output/uniprot";
92+
fs::path pdbs_path = "./pdb_output/pdbs";
93+
std::vector<std::string> file_list;
94+
95+
if(process_all_pdbs) {
96+
for(const auto& entry : fs::directory_iterator(pdbs_path)) {
97+
file_list.push_back(entry.path().c_str());
98+
}
99+
} else {
100+
file_list = readUniprotFiles(uniprot_path);
101+
}
102+
103+
int total_files = file_list.size();
104+
105+
for(int i = 0; i < total_files; ++i) {
106+
if(process_all_pdbs) {
107+
parseKmersFile(file_list[i]);
108+
} else {
109+
PdbInfo selected_pdb = parseInfoFile(file_list[i]);
110+
111+
std::string kmers_file_path = (pdbs_path / (selected_pdb.pdb_id + ".kmers")).c_str();
112+
if(fs::exists(kmers_file_path)) {
113+
parseKmersFile(kmers_file_path);
114+
}
115+
}
116+
117+
if((i + 1) % 100 == 0 || i + 1 == total_files) {
118+
std::cerr << "\rProcessed " << (i + 1) << " / " << total_files << "; "
119+
<< std::fixed << std::setprecision(2) << static_cast<double>(i + 1) / total_files * 100 << "%";
120+
std::cerr.flush();
121+
}
122+
}
123+
124+
std::cerr << std::endl << "Prepairing results..." << std::endl;
125+
126+
std::vector<std::pair<std::string, int>> sorted_kmers(global_kmers.begin(), global_kmers.end());
127+
128+
std::sort(sorted_kmers.begin(), sorted_kmers.end(), [](const auto& a, const auto& b) {
129+
return a.second > b.second;
130+
});
131+
132+
for(const auto& [kmer, freq] : sorted_kmers) {
133+
std::cout << kmer << " " << freq << std::endl;
134+
}
135+
136+
return 0;
137+
}

‎kmers/__init__.py

Whitespace-only changes.

‎kmers/calculate_kmer.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from sklearn.neighbors import KDTree
2+
3+
from kmers.pdb_data import PDBData
4+
5+
import networkx as nx
6+
7+
8+
def calculate_kmers(pdb_data: PDBData, generate_graph: bool = False) -> list[str] or tuple[list[str], nx.Graph]:
9+
"""
10+
Takes in a file of experimental data in the format <AA> <x> <y> <z>
11+
and constructs proximity based k-mers for each, within 15 angstroms.
12+
13+
Uses a KDTree-based approach
14+
"""
15+
residues = pdb_data.residue_list
16+
coordinates = pdb_data.coordinates
17+
18+
indices, distances = _nearest_neighbours(residues, coordinates)
19+
20+
# if e.g. a graph is needed, this is the place to do it
21+
# something like, maybe this works?
22+
graph = nx.Graph()
23+
if generate_graph:
24+
for residue_idx, (ind, dist) in enumerate(zip(indices, distances)):
25+
for neighbor_idx, distance in zip(ind, dist):
26+
if residue_idx != neighbor_idx: # avoid self-connections
27+
graph.add_edge(residue_idx, neighbor_idx, weight=distance)
28+
29+
# nx.write_graphml(graph, "residues.graphml")
30+
31+
closest_letters = [''.join([residues[i] for i in ind]) for ind in indices]
32+
33+
if generate_graph:
34+
return closest_letters, graph
35+
36+
return closest_letters
37+
38+
39+
def _nearest_neighbours(_residues, coordinates, search_radius=15):
40+
"""
41+
Returns 2 lists, each containing lists of indices of residues and distances.
42+
For every residue, the list will be a sorted list of all residues within the search radius.
43+
default=15 angstroms
44+
"""
45+
tree = KDTree(coordinates)
46+
indices, distances = tree.query_radius(coordinates, r=search_radius, return_distance=True, sort_results=True)
47+
48+
return indices, distances

‎kmers/pdb_data.py

+116
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
2+
"""
3+
PDBData takes a byte stream (produces by extract_pdb_coordinates) and parses it into a PDBData object.
4+
5+
The format of the byte stream is as follows:
6+
1. First line is resolution:
7+
'resolut: {resolution}' (float)
8+
2. Second line is uniprot_id. Can be comma separated if multiple uniprot_ids are found:
9+
'uniprot: {uniprot_id}' (str)
10+
3. Third line is matched sequence. This is the sequence that is found in the PDB file (SEQRES)
11+
'matched: {sequence}' (str)
12+
4. Fourth line is parsed sequence. These are the residues that are found in the PDB file (ATOM)
13+
'parsed: {parsed_sequence}' (str)
14+
5. Lines 5 to n are other sequences that are found in the PDB file (SEQRES)
15+
'other: {other_sequence}' (str)
16+
6. Lines n+1 is a blank line
17+
7. Lines n+2 to m are the coordinates of the PDB file (ATOM)
18+
'{residue_name [e.g. A/E/M]} {x} {y} {z}' (char, float, float, float)
19+
"""
20+
21+
22+
class PDBData:
23+
"""Takes in a byte stream"""
24+
def __init__(self, pdb_byte_stream):
25+
self._pdb_id = None
26+
self._resolution = None
27+
self._parsed_sequence = None
28+
self._first_residue_number = None
29+
self._residue_list = []
30+
self._coordinates = []
31+
32+
self._input_uniprot_ids = []
33+
self._input_matched_sequence = None
34+
self._input_other_sequences = []
35+
36+
self._parse(pdb_byte_stream)
37+
38+
def _parse(self, pdb_byte_stream):
39+
lines = pdb_byte_stream.decode('utf-8').split("\n")
40+
41+
# Parse pdb id
42+
self._pdb_id = lines[0].split(': ')[1]
43+
44+
# Parse the resolution
45+
self._resolution = float(lines[1].split(': ')[1])
46+
47+
# Parse the uniprot ids
48+
self._input_uniprot_ids = lines[2].split(': ')[1].split(',')
49+
50+
# Parse the matched sequence
51+
self._input_matched_sequence = lines[3].split(': ')[1]
52+
# if self._input_matched_sequence == 'N/A':
53+
# print('N/A')
54+
55+
# Parse the parsed sequence
56+
self._parsed_sequence = lines[4].split(': ')[1]
57+
58+
# Parse the other sequences
59+
idx = 5
60+
while lines[idx].startswith('other: '):
61+
self._input_other_sequences.append(lines[idx].split(': ')[1])
62+
idx += 1
63+
64+
# parse the first residue number (initres)
65+
self._first_residue_number = int(lines[idx].split(': ')[1])
66+
67+
# Parse the coordinates
68+
idx += 2 # skip the initres and blank line
69+
while idx < len(lines) and lines[idx].strip(): # Ensure we are not at the end or at a blank line
70+
parts = lines[idx].split()
71+
residue_name = parts[0]
72+
x, y, z = map(float, parts[1:])
73+
self._residue_list.append(residue_name)
74+
self._coordinates.append((x, y, z))
75+
idx += 1
76+
77+
@property
78+
def pdb_id(self):
79+
return self._pdb_id
80+
81+
@property
82+
def resolution(self):
83+
return self._resolution
84+
85+
@property
86+
def first_residue_number(self):
87+
return self._first_residue_number
88+
89+
@property
90+
def residue_sequence_parsed(self):
91+
"""
92+
The parsed sequence is the sequence extracted from the PDB ATOM records.
93+
:return:
94+
"""
95+
return self._parsed_sequence
96+
97+
@property
98+
def residue_sequence_matched(self):
99+
"""
100+
The matched sequence is the sequence extracted from the PDB SEQRES records,
101+
of which the parsed sequence is a substring of.
102+
:return:
103+
"""
104+
return self._input_matched_sequence
105+
106+
@property
107+
def uniprot_ids(self):
108+
return self._input_uniprot_ids
109+
110+
@property
111+
def residue_list(self):
112+
return self._residue_list
113+
114+
@property
115+
def coordinates(self):
116+
return self._coordinates

‎kmers/pdb_gz_processor.py

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import gzip
2+
import sqlite3
3+
import subprocess
4+
import time
5+
from pathlib import Path
6+
7+
from kmers.calculate_kmer import calculate_kmers
8+
from kmers.pdb_data import PDBData
9+
10+
11+
class GZProcessor:
12+
def __init__(self, db_path, process_dir, out_uniprot_dir, out_pdbs_dir, handle_all_pdbs):
13+
self.db_path = db_path
14+
self.process_dir = process_dir
15+
self.out_uniprot_dir = out_uniprot_dir
16+
self.out_pdbs_dir = out_pdbs_dir
17+
self.handle_all_pdbs = handle_all_pdbs
18+
19+
if not self.handle_all_pdbs:
20+
self.conn = sqlite3.connect(f'file:{self.db_path}?mode=ro', uri=True)
21+
22+
self.codes = {'SUCCESS': 0}
23+
self.max_pdb_count = 1 # to avoid division by zero
24+
self.cur_pdb_count = 0
25+
26+
def process_gz_file(self, gz_file):
27+
28+
# 1. extract coordinates
29+
try:
30+
parsed_pdb = self.extract_coordinates(gz_file)
31+
self.codes['SUCCESS'] += 1
32+
except Exception as e:
33+
self.codes[str(e)] = self.codes.get(str(e), 0) + 1
34+
return
35+
36+
# 2. parse data
37+
pdb_data = PDBData(parsed_pdb)
38+
39+
# 3. find matching uniprot entry, reject if not found
40+
if not self.handle_all_pdbs:
41+
uniprot_id = self.get_matching_uniprot_entry(pdb_data)
42+
if uniprot_id is None:
43+
self.codes['SUCCESS'] -= 1
44+
self.codes['NO_UNIPROT_ID'] = self.codes.get('NO_UNIPROT_ID', 0) + 1
45+
return
46+
47+
# print(f'{pdb_id} -> {uniprot_id}')
48+
49+
kmers = calculate_kmers(pdb_data)
50+
# 4. write data to pdb & uniprot files
51+
self._write_pdb_file(pdb_data.pdb_id, kmers)
52+
if not self.handle_all_pdbs:
53+
self._append_to_uniprot_file(uniprot_id, pdb_data.pdb_id, pdb_data) # noqa
54+
55+
# 5. pass kmers to natural set parser
56+
# TODO
57+
58+
@staticmethod
59+
def extract_coordinates(gz_file):
60+
proc = subprocess.Popen(['bin/extract_pdb_coordinates'],
61+
stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
62+
with gzip.open(gz_file, 'r') as f_in:
63+
pdb_str = f_in.read()
64+
65+
parsed_pdb, err = proc.communicate(input=pdb_str)
66+
67+
if proc.returncode != 0:
68+
raise Exception(err.decode('utf-8').splitlines()[0])
69+
70+
return parsed_pdb
71+
72+
def process_files(self):
73+
"""
74+
Process all files in the process_dir
75+
:return:
76+
"""
77+
self.max_pdb_count = count_files(self.process_dir, '*.ent.gz')
78+
print(f'Processing {self.max_pdb_count} PDB files files...')
79+
80+
time_start = time.time()
81+
82+
for gz_file in Path(self.process_dir).rglob('*.ent.gz'):
83+
self.process_gz_file(gz_file)
84+
self.cur_pdb_count += 1
85+
86+
if self.cur_pdb_count % 100 == 0:
87+
self.print_progress()
88+
89+
self.print_progress()
90+
time_end = time.time()
91+
92+
self.print_codes()
93+
print(f'\nCompleted in {time_end - time_start:.2f} seconds')
94+
95+
def print_progress(self):
96+
print(f'\r{self.cur_pdb_count:<{len(str(self.max_pdb_count))}} / {self.max_pdb_count}, '
97+
f'{self.cur_pdb_count / self.max_pdb_count:.1%}', end='')
98+
99+
def print_codes(self):
100+
print()
101+
for k, v in self.codes.items():
102+
print(f'{k}: {v}')
103+
104+
def get_matching_uniprot_entry(self, pdb_data):
105+
"""
106+
Fetch sequences by uniprot ids first and then check for sequence match.
107+
"""
108+
sequence = pdb_data.residue_sequence_parsed
109+
pdb_uniprot_ids = pdb_data.uniprot_ids
110+
first_residue_number = pdb_data.first_residue_number
111+
112+
cur = self.conn.cursor()
113+
114+
# Prepare IDs for the query
115+
placeholders = ', '.join(['?'] * len(pdb_uniprot_ids))
116+
query = f'SELECT id, sequence FROM sequences WHERE id IN ({placeholders})'
117+
118+
cur.execute(query, tuple(pdb_uniprot_ids))
119+
ids_and_sequences = cur.fetchall()
120+
121+
if len(ids_and_sequences) == 0:
122+
return None
123+
124+
matched_uniprot_ids = []
125+
# Check for exact match if first_residue_number is not 0
126+
if first_residue_number != 0:
127+
# Find sequences that exactly match the input sequence starting and ending at the given points
128+
matched_uniprot_ids = [entry[0] for entry in ids_and_sequences if sequence ==
129+
entry[1][first_residue_number - 1:first_residue_number + len(sequence) - 1]]
130+
131+
# If no exact matches or first_residue_number is 0, find sequences that have the input sequence as a substring
132+
if not matched_uniprot_ids:
133+
matched_uniprot_ids = [entry[0] for entry in ids_and_sequences if sequence in entry[1]]
134+
135+
all_matches = [x for x in pdb_uniprot_ids if x in matched_uniprot_ids]
136+
137+
# if len(all_matches) > 1:
138+
# print(f'Found multiple matches for {sequence}: {all_matches}')
139+
return all_matches[0] if len(all_matches) > 0 else None
140+
141+
def _write_pdb_file(self, pdb_id: str, kmers: list[str]):
142+
with open(f'{self.out_pdbs_dir}/{pdb_id}.kmers', 'w') as f_out:
143+
for kmer in kmers:
144+
f_out.write(f'{kmer}\n')
145+
146+
def _append_to_uniprot_file(self, uniprot_id: str, pdb_id: str, pdb_data: PDBData):
147+
if not Path(f'{self.out_uniprot_dir}/{uniprot_id}.info').exists():
148+
with open(f'{self.out_uniprot_dir}/{uniprot_id}.info', 'w') as f_out:
149+
f_out.write(f'>{uniprot_id}\n')
150+
151+
with open(f'{self.out_uniprot_dir}/{uniprot_id}.info', 'a') as f_out:
152+
f_out.write(f'{pdb_id} {pdb_data.resolution} {len(pdb_data.residue_sequence_parsed)} '
153+
f'{pdb_data.residue_sequence_parsed}\n')
154+
155+
156+
def count_files(directory='.', extension='*'):
157+
count = 0
158+
for _ in Path(directory).rglob(extension):
159+
count += 1
160+
return count

‎kmers/pipeline.py

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import argparse
2+
import os
3+
from pathlib import Path
4+
5+
from kmers.pdb_gz_processor import GZProcessor
6+
7+
"""
8+
- 1. OS walk through all PDB gz files
9+
- 2. Extract the gz files
10+
- 3. extract the PDB coordinates annotated with CA (with C++ script) (PDB_ID.coor)
11+
- 3.1. If unsuccessful (return 1), delete extracted file & created file, next file
12+
- 3.2 If successful, delete extracted file & continue to 4.
13+
- 4. Calculate the Kmers
14+
5. Write them to file (PDB_ID.kmers)
15+
6. Repeat for all files
16+
17+
After batch processing all files:
18+
# 7. Associate each PDB with a protein from the DB
19+
# 8. Select the PDB with the highest resolution
20+
...
21+
22+
9. Read all Kmers into memory (set), with frequency information
23+
9.1. Truncate to 12 residues
24+
9.2. Discard ones which are shorter
25+
10. Save natural set where n=12 to disk, along with frequencies
26+
11. Generate synthetic set. (all possible Kmers of n=12)
27+
12. Calculate difference
28+
13. Extract statistics on freq. dist of natural set, fraction appearing in synthetic set, etc.
29+
"""
30+
31+
32+
standard_error_codes = {
33+
"RESOLUTION_TOO_LOW",
34+
"MISSING_NON_TERMINAL_RESIDUES",
35+
"IS_NOT_PROTEIN",
36+
"NO_ALPHA_CARBON_ATOMS_FOUND",
37+
"EXCLUDE_SELENOCYSTEINE_AND_PYRROLYSINE",
38+
"NO_UNIPROT_ID"
39+
}
40+
41+
42+
def check_empty_directory(directory):
43+
if not os.path.exists(directory):
44+
return
45+
46+
if os.listdir(directory):
47+
raise RuntimeError(f"Error: Output directory '{directory}' is not empty. Aborting.")
48+
49+
50+
def check_file_exists(file_path):
51+
if not os.path.exists(file_path):
52+
raise RuntimeError(f"Error: File '{file_path}' does not exist. Aborting.")
53+
54+
if not os.path.isfile(file_path):
55+
raise RuntimeError(f"Error: Path '{file_path}' is not a file. Aborting.")
56+
57+
58+
def check_directory_exists(directory_path):
59+
if not os.path.exists(directory_path):
60+
raise RuntimeError(f"Error: Directory '{directory_path}' does not exist. Aborting.")
61+
62+
if not os.path.isdir(directory_path):
63+
raise RuntimeError(f"Error: Path '{directory_path}' is not a directory. Aborting.")
64+
65+
66+
def check_directory_contains_files(directory_path, extension=""):
67+
for f in Path(directory_path).rglob(f"*{extension}"):
68+
if f.is_file():
69+
return True
70+
if extension == "":
71+
raise RuntimeError(f"Error: Directory '{directory_path}' does not contain any files. Aborting.")
72+
else:
73+
raise RuntimeError(f"Error: Directory '{directory_path}' does not contain any files with extension "
74+
f"'{extension}'. Aborting.")
75+
76+
77+
if __name__ == '__main__':
78+
parser = argparse.ArgumentParser(description='Process PDB files.')
79+
parser.add_argument('--handle_all_pdbs', required=True, type=bool,
80+
help='Set to True to handle all PDBs without checking for uniprot IDs')
81+
args = parser.parse_args()
82+
83+
if args.handle_all_pdbs not in [True, False]:
84+
raise ValueError("The --handle_all_pdbs argument must be set to either True or False.")
85+
86+
db_path = os.path.expanduser('uniprotkb/uniprot_sequences.db')
87+
88+
if not args.handle_all_pdbs: # Only check for database if we need it
89+
try:
90+
check_file_exists(db_path)
91+
except RuntimeError:
92+
print("Error: Database of uniprot sequences not found. Run ./prepare.sh after downloading"
93+
" uniprot_sprot.fasta.gz and (optionally) uniprot_trembl.fasta.gz into the uniprotkb dir."
94+
"Disk space for sprot only is ~400MB, or ~250GB for both.")
95+
exit(1)
96+
97+
process_dir = 'pdb'
98+
try:
99+
check_directory_exists(process_dir)
100+
check_directory_contains_files(process_dir, extension='.ent.gz')
101+
except RuntimeError:
102+
print("Error: Directory 'pdb' not found or does not contain any .ent.gz files. "
103+
"It should contain e.g. pdb/a0/1a00.ent.gz, pdb/a0/1a01.ent.gz, etc."
104+
"Download PDB files from ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/ "
105+
"and extract them into the pdb directory."
106+
"A mirror can be found at https://pycom.brunel.ac.uk/misc/ (42GB tar file * 2 = 84GB)")
107+
exit(1)
108+
109+
output_dir = 'pdb_output'
110+
try:
111+
check_empty_directory(output_dir)
112+
except RuntimeError:
113+
print("Error: Output directory 'pdb_output' is not empty. Aborting."
114+
"If you want to re-run the script, delete the directory first.")
115+
exit(1)
116+
117+
if not os.path.exists(output_dir):
118+
os.makedirs(output_dir)
119+
os.makedirs(os.path.join(output_dir, 'uniprot'))
120+
os.makedirs(os.path.join(output_dir, 'pdbs'))
121+
122+
out_uniprot = os.path.join(output_dir, 'uniprot')
123+
out_pdbs = os.path.join(output_dir, 'pdbs')
124+
125+
processor = GZProcessor(db_path, process_dir, out_uniprot, out_pdbs, args.handle_all_pdbs)
126+
processor.process_files()

‎pdb/README.md

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# PDB Folder
2+
3+
This folder should contain compressed PDB files in the following format:
4+
5+
(dividing not required, can be in the same directry)
6+
```
7+
pdb/a0/1a00.ent.gz
8+
pdb/a0/1a01.ent.gz
9+
...
10+
```
11+
12+
These can be downloaded downloaded from the official hosts / mirrors for PDB:
13+
14+
Official website describing how to download:
15+
https://www.rcsb.org/docs/programmatic-access/file-download-services
16+
17+
Main mirror:
18+
https://files.wwpdb.org/pub/pdb/data/structures/divided/pdb/
19+
20+
Other mirrors:
21+
https://www.wwpdb.org/ftp/pdb-ftp-sites
22+
23+
Alternatively, a tarball of all pdbs (as of 2023-07-28) is hosted on https://pycom.brunel.ac.uk/misc/pdb_2023-07-28.tar (42GB large, needs to be uncompressed, so 84GB temporary storage needed)

‎prepare.sh

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/bin/bash
2+
3+
mdir=$(dirname $(realpath "$0"))
4+
cd "$mdir"
5+
6+
# List of uniprot files.
7+
# SwissProt only: 92 MB .gz file to 250 MB database
8+
files=("uniprotkb/uniprot_sprot.fasta.gz")
9+
10+
# SwissProt+TrEMBL: 62 GB .gz files to ~191 GB database
11+
# files=("uniprotkb/uniprot_sprot.fasta.gz", "uniprotkb/uniprot_trembl.fasta.gz")
12+
13+
14+
# Check if sqlite3 is installed
15+
command -v sqlite3 >/dev/null 2>&1 || { echo >&2 "sqlite3 required but it's not installed (brew install sqlite3 [on macos]). Aborting."; exit 1; }
16+
17+
# Check if g++ is installed
18+
command -v g++ >/dev/null 2>&1 || { echo >&2 "g++ required but it's not installed. Aborting."; exit 1; }
19+
20+
# Check if gunzip is installed
21+
command -v gunzip >/dev/null 2>&1 || { echo >&2 "gunzip required but it's not installed. Aborting."; exit 1; }
22+
23+
24+
# Compile C++ program
25+
echo "Compiling C++ programs..."
26+
arch -x86_64 g++ -std=c++17 -o "$mdir"/bin/fasta_to_sqlite "$mdir"/cpp_scripts/fasta_to_sqlite/*.cpp -lsqlite3
27+
if [ $? -ne 0 ]; then
28+
echo "Compilation failed. Please check the source code."
29+
exit 1
30+
fi
31+
32+
g++ -std=c++17 -o "$mdir"/bin/post_process_kmers "$mdir"/cpp_scripts/post_process_kmers/*.cpp
33+
if [ $? -ne 0 ]; then
34+
echo "Compilation failed. Please check the source code."
35+
exit 1
36+
fi
37+
38+
g++ -std=c++17 -o "$mdir"/bin/extract_pdb_coordinates "$mdir"/cpp_scripts/extract_pdb_coordinates/*.cpp
39+
if [ $? -ne 0 ]; then
40+
echo "Compilation failed. Please check the source code."
41+
exit 1
42+
fi
43+
44+
echo "Compiled all C++ programs."
45+
46+
# Check if the database file exists
47+
dbfile="$mdir/uniprotkb/uniprot_sequences.db"
48+
if [[ -f "$dbfile" ]]; then
49+
read -p "The file $dbfile already exists. Do you want to overwrite it? (yes/no): " response
50+
if [[ "$response" == "no" ]]; then
51+
echo "Aborting."
52+
exit 0
53+
elif [[ "$response" == "yes" ]]; then
54+
rm "$dbfile"
55+
else
56+
echo "Invalid response. Please type 'yes' or 'no'."
57+
exit 1
58+
fi
59+
fi
60+
61+
for filepath in "${files[@]}"; do
62+
# Check if file exists
63+
if [[ ! -f "$mdir/$filepath" ]]; then
64+
echo "The required file $filepath is missing."
65+
echo "Please make sure that uniprot_trembl.fasta.gz and uniprot_sprot.fasta.gz are in the 'uniprotkb' directory."
66+
echo "The file can be downloaded from https://www.uniprot.org/downloads"
67+
exit 1
68+
fi
69+
done
70+
71+
# Create database
72+
for filepath in "${files[@]}"; do
73+
echo "Processing $filepath..."
74+
gunzip -c "$mdir/$filepath" | ./bin/fasta_to_sqlite
75+
echo "Done processing $filepath"
76+
done
77+
78+
# Create indexes on the database
79+
echo "Creating indexes..."
80+
sqlite3 "$dbfile" "CREATE INDEX IF NOT EXISTS idx_id ON sequences (id);"
81+
82+
# this index is too large (doubles the DB size)
83+
# sqlite3 $dbfile "CREATE INDEX IF NOT EXISTS idx_sequence ON sequences (sequence);"
84+
echo "Indexes created."
85+
86+
echo "Extracting Residue Coordinates and Generating k-mers..."
87+
PYTHONPATH="${PYTHONPATH}:$mdir" python3 kmers/pipeline.py --handle_all_pdbs true
88+
echo "Done generating k-mers"
89+
90+
k=12
91+
92+
echo "Extracting most frequent k-mers of length k=$k"
93+
./bin/post_process_kmers -a -k "$k > kmers.txt"
94+
echo "Finished. Results in \`kmers.txt\`"
95+
echo "Top k-mers"
96+
cat kmers.txt | head

‎top10000kmers.txt

+10,000
Large diffs are not rendered by default.

‎uniprotkb/README.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
This folder should contain two files:
2+
3+
uniprot_sprot.fasta.gz
4+
uniprot_trembl.fasta.gz
5+
6+
The files can be downloaded from https://www.uniprot.org/downloads
7+
They total around 55 GB.

0 commit comments

Comments
 (0)
Please sign in to comment.