cemiu · Jul 29, 2023
diff --git a/‎README.md
+49 b/‎README.md
+49
diff --git a/‎cpp_scripts/.process_natural_set/inputf.txt
+5 b/‎cpp_scripts/.process_natural_set/inputf.txt
+5
diff --git a/‎cpp_scripts/.process_natural_set/outputf.txt
+1 b/‎cpp_scripts/.process_natural_set/outputf.txt
+1
diff --git a/‎cpp_scripts/.process_natural_set/process_natural_set.cpp
+108 b/‎cpp_scripts/.process_natural_set/process_natural_set.cpp
+108
diff --git a/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.cpp
+19 b/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.cpp
+19
diff --git a/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.h
+16 b/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.h
+16
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Constants.cpp
+22 b/‎cpp_scripts/extract_pdb_coordinates/Constants.cpp
+22
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Constants.h
+42 b/‎cpp_scripts/extract_pdb_coordinates/Constants.h
+42
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Utils.cpp
+136 b/‎cpp_scripts/extract_pdb_coordinates/Utils.cpp
+136
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Utils.h
+20 b/‎cpp_scripts/extract_pdb_coordinates/Utils.h
+20
diff --git a/‎cpp_scripts/extract_pdb_coordinates/extract_pdb_coordinates.cpp
+284 b/‎cpp_scripts/extract_pdb_coordinates/extract_pdb_coordinates.cpp
+284
diff --git a/‎cpp_scripts/extract_pdb_coordinates/inputf.txt
+2,261 b/‎cpp_scripts/extract_pdb_coordinates/inputf.txt
+2,261
diff --git a/‎cpp_scripts/extract_pdb_coordinates/outputf.txt
+114 b/‎cpp_scripts/extract_pdb_coordinates/outputf.txt
+114
diff --git a/‎cpp_scripts/fasta_to_sqlite/inputf.txt
+169 b/‎cpp_scripts/fasta_to_sqlite/inputf.txt
+169
diff --git a/‎cpp_scripts/fasta_to_sqlite/uniprot_to_sqlite.cpp
+70 b/‎cpp_scripts/fasta_to_sqlite/uniprot_to_sqlite.cpp
+70
diff --git a/‎cpp_scripts/post_process_kmers/inputf.txt b/‎cpp_scripts/post_process_kmers/inputf.txt
diff --git a/‎cpp_scripts/post_process_kmers/outputf.txt b/‎cpp_scripts/post_process_kmers/outputf.txt
diff --git a/‎cpp_scripts/post_process_kmers/post_process_kmers.cpp
+137 b/‎cpp_scripts/post_process_kmers/post_process_kmers.cpp
+137
diff --git a/‎kmers/__init__.py b/‎kmers/__init__.py
diff --git a/‎kmers/calculate_kmer.py
+48 b/‎kmers/calculate_kmer.py
+48
diff --git a/‎kmers/pdb_data.py
+116 b/‎kmers/pdb_data.py
+116
diff --git a/‎kmers/pdb_gz_processor.py
+160 b/‎kmers/pdb_gz_processor.py
+160
diff --git a/‎kmers/pipeline.py
+126 b/‎kmers/pipeline.py
+126
diff --git a/‎pdb/README.md
+23 b/‎pdb/README.md
+23
diff --git a/‎prepare.sh
+96 b/‎prepare.sh
+96
diff --git a/‎top10000kmers.txt
+10,000 b/‎top10000kmers.txt
+10,000
diff --git a/‎uniprotkb/README.md
+7 b/‎uniprotkb/README.md
+7
@@ -0,0 +1,49 @@
+# k-mers
+Description and purpose of the project will come at a later point.
+
+## Installation
+
+### Prerequisites
+- `g++`: through gcc installation
+- `sqlite3`: Debian/Ubuntu: `apt install sqlite3`; macOS: `brew install sqlite3`
+- `gzip / gunzip`: usually pre-installed
+- `python3`: installed and on the `PATH`
+
+### Preparation
+Clone the repo
+```
+git clone https://github.com/cemiu/kmers.git && cd kmers
+```
+Two folders need to be populated.
+#### pdb
+The pdb folder has to contain all experimental PDB files in the .ent.gz format.
+Instructions for downloading can be found here:
+https://www.rcsb.org/docs/programmatic-access/file-download-services
+https://files.wwpdb.org/pub/pdb/data/structures/divided/pdb/
+
+Alternatively, an outdated mirror of some of the files can be found here: https://pycom.brunel.ac.uk/misc/pdb_2023-07-28.tar (42 GB)
+
+Once downloaded they have to be placed in the `pdb` folder **without** being uncompressed. It does not matter whether they are in `pdb/file.ent.gz` or `pdb/<folder>/file.ent.gz`.
+
+#### uniprotkb
+The project requires `uniprot_sprot.fasta.gz` (400 MB after processing)
+Optionally, `uniprot_trembl.fasta.gz` can be used, to match more PDBs (250 GB after processing).
+
+The latter might result in (slightly) more PDBs which can be associated with a protein. The difference is expected to be trivial.
+
+Place the files in the `uniprotkb` folder without uncompressing them.
+By default, only Swiss-Prot is used. To also use TrEMBL, uncomment line 11 in `prepare.sh`.
+
+### Running
+
+To run the script, execute:
+```
+./prepare.sh
+```
+
+This will
+- Compile C++ binaries
+- Process Swiss-Prot / TrEMBL into a database 
+  - (`uniprot_sprot.fasta.gz` / `uniprot_trembl.fasta.gz`) can be deleted afterwards
+- Extract 3D k-mers from the PDBs
+- TODO: process the k-mers
@@ -0,0 +1,5 @@
+A
+B
+C
+D
+E
@@ -0,0 +1 @@
+sdafasefewkj
@@ -0,0 +1,108 @@
+#include <algorithm>
+#include <exception>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// using namespace std;
+
+// Writes the command-line usage summary to standard output.
+void printHelp() {
+    // Adjacent string literals are concatenated by the compiler, so the whole
+    // help text is emitted with a single stream insertion.
+    static const char *const usageText =
+        "Usage: ./program [-k <kmer size>] [-f] [-s] [--help]\n"
+        "Options:\n"
+        "  -k <kmer size>     Set the size of the kmers (default=12)\n"
+        "  -f                 Output frequencies only (default=false)\n"
+        "  -s                 Output statistics and missing kmers (default=false)\n"
+        "  --help, -h         Show this help message\n";
+
+    std::cout << usageText;
+}
+
+// Generate synthetic set using recursion
+// void generateSyntheticSet(const string& kmer, int k, const unordered_map<string, int>& naturalSet, const function<void(const string&)>& callback) {
+//     static string aminoAcids = "ACDEFGHIKLMNPQRSTVWY";
+
+//     if (k == 0) {
+//         if (naturalSet.find(kmer) == naturalSet.end()) {
+//             callback(kmer);
+//         }
+//         return;
+//     }
+
+//     for (char c : aminoAcids) {
+//         generateSyntheticSet(kmer + c, k - 1, naturalSet, callback);
+//     }
+// }
+
+// Main function
+// Counts k-mers read from standard input and prints them by descending frequency.
+//
+// Each input line contributes its first `kmerSize` characters as one k-mer; lines
+// shorter than that are ignored, and reading stops at the first empty line or EOF.
+// Options: -k <size> (default 12), -f (print counts only), -s (accepted, currently
+// without effect), --help / -h (print usage and exit with 0).
+// Returns 1 for a missing or invalid -k value and when no k-mer was read.
+int main(int argc, char* argv[]) {
+	std::ios_base::sync_with_stdio(false);
+	std::cin.tie(nullptr);
+
+	// Default values
+	std::string::size_type kmerSize = 12;
+	bool outputFrequenciesOnly = false;
+	bool outputStatsAndMissingKmers = false;
+
+	// Check command-line arguments
+	for (int i = 1; i < argc; i++) {
+		const std::string arg = argv[i];
+		if (arg == "-k") {
+			if (i + 1 >= argc) { // there was no argument to the kmer option
+				std::cerr << "-k option requires one argument." << std::endl;
+				return 1;
+			}
+
+			// std::stoi throws on non-numeric or out-of-range text; treat that
+			// like any other unusable size instead of terminating the program.
+			int requestedSize = 0;
+			try {
+				requestedSize = std::stoi(argv[++i]); // consume the option's value
+			} catch (const std::exception &) {
+				requestedSize = 0;
+			}
+			if (requestedSize <= 0) {
+				std::cerr << "-k option requires a positive integer." << std::endl;
+				return 1;
+			}
+			kmerSize = static_cast<std::string::size_type>(requestedSize);
+		}
+		else if (arg == "-f") {
+			outputFrequenciesOnly = true;
+		}
+		else if (arg == "-s") {
+			outputStatsAndMissingKmers = true;
+		}
+		else if (arg == "--help" || arg == "-h") {
+			printHelp();
+			return 0;
+		}
+	}
+
+	std::unordered_map<std::string, int> kmerCounts;
+
+	std::string inputLine;
+	while (std::getline(std::cin, inputLine) && !inputLine.empty()) {
+		if (inputLine.size() >= kmerSize) {
+			kmerCounts[inputLine.substr(0, kmerSize)]++;
+		}
+	}
+
+	if (kmerCounts.empty()) {
+		printHelp();
+		return 1;
+	}
+
+	// Transfer the unordered_map to vector of pairs for sorting.
+	std::vector<std::pair<std::string, int>> sortedKmers(kmerCounts.begin(), kmerCounts.end());
+
+	// Highest count first; equal counts are ordered by k-mer so that the output
+	// does not depend on the hash map's iteration order.
+	std::sort(sortedKmers.begin(), sortedKmers.end(),
+		[](const std::pair<std::string, int> &a, const std::pair<std::string, int> &b) {
+			if (a.second != b.second)
+				return a.second > b.second;
+			return a.first < b.first;
+		});
+
+	// '\n' instead of std::endl: no flush per line, the stream is flushed at exit.
+	for (const auto &kmer : sortedKmers) {
+		if (outputFrequenciesOnly) {
+			std::cout << kmer.second << '\n';
+		} else {
+			std::cout << kmer.second << " " << kmer.first << '\n';
+		}
+	}
+
+	// Output statistics and missing kmers
+	// if (outputStatsAndMissingKmers) {
+	// 	cout << "Coverage: " << static_cast<double>(kmerCounts.size()) / pow(20, kmerSize) * 100 << "% (" << kmerCounts.size() << "/" << static_cast<int>(pow(20, kmerSize)) << ")\n";
+	// 	generateSyntheticSet("", kmerSize, kmerCounts, [](const string& missingKmer) {
+	// 		cout << missingKmer << "\n";
+	// 	});
+	// }
+	(void)outputStatsAndMissingKmers; // only read by the disabled block above
+
+	return 0;
+}
@@ -0,0 +1,19 @@
+#include "AtomDataParser.h"
+#include <string>
+#include <sstream>
+// #include <iostream>
+
+// Fills `data` from the fixed-width fields of one PDB ATOM record in `str`.
+//
+// `offset` is subtracted from every column position. Fields read, as
+// (0-based column, width): atom name (12, 4), residue name (17, 3), residue
+// sequence number (22, 4), x (30, 8), y (38, 8), z (46, 8).
+// std::string::substr, std::stoi and std::stof throw when `str` is too short
+// or a numeric field cannot be parsed; nothing is caught here.
+void parseAtomData(const std::string& str, AtomData& data, size_t offset)
+{
+    // Returns the field that starts at `column` (before shifting) with `width` characters.
+    const auto field = [&str, offset](size_t column, size_t width) {
+        return str.substr(column - offset, width);
+    };
+
+    // Stream extraction keeps only the first whitespace-delimited token of the name field.
+    std::string atomName;
+    std::stringstream nameStream(field(12, 4));
+    nameStream >> atomName;
+
+    data.isValidAtom = (atomName == "CA"); // whether the atom is ca
+
+    data.resName = field(17, 3);
+    data.resSeq = std::stoi(field(22, 4));
+    data.x = std::stof(field(30, 8));
+    data.y = std::stof(field(38, 8));
+    data.z = std::stof(field(46, 8));
+}
@@ -0,0 +1,16 @@
+#ifndef ATOMDATAPARSER_H
+#define ATOMDATAPARSER_H
+
+#include <string>
+
+// Values extracted from a single PDB ATOM record.
+// All members have defined defaults, so a default-constructed AtomData can be
+// read safely even before parseAtomData has filled it in.
+struct AtomData
+{
+    bool isValidAtom = false; // whether the atom is valid (CA)
+    std::string resName;      // residue name (AA)
+    int resSeq = 0;           // residue sequence number
+    float x = 0.0f, y = 0.0f, z = 0.0f; // coordinates
+};
+
+// Parses the ATOM record `str` into `data`; `offset` is subtracted from the
+// column positions of all fields (0 for an unmodified record).
+void parseAtomData(const std::string& str, AtomData& data, size_t offset = 0);
+
+#endif // ATOMDATAPARSER_H
@@ -0,0 +1,22 @@
+#include "Constants.h"
+
+// Printable name of every PDBParsingCode, in the order of PDB_PARSING_CODES.
+// Each X(identifier, label) entry of the list expands to its label here.
+#define X(identifier, label) label,
+const char *code_name[] = { PDB_PARSING_CODES };
+#undef X
+
+const float MAX_RESOLUTION = 2.5f;
+
+// Maps three-letter residue names to one-letter codes.
+const std::unordered_map<std::string, char> aminoAcidLookup{
+    {"ALA", 'A'},
+    {"ARG", 'R'},
+    {"ASN", 'N'},
+    {"ASP", 'D'},
+    {"CYS", 'C'},
+    {"GLN", 'Q'},
+    {"GLU", 'E'},
+    {"GLY", 'G'},
+    // HIS, HIP and HIE all map to 'H'
+    {"HIS", 'H'},
+    {"HIP", 'H'},
+    {"HIE", 'H'},
+    {"ILE", 'I'},
+    {"LEU", 'L'},
+    {"LYS", 'K'},
+    {"MET", 'M'},
+    {"PHE", 'F'},
+    {"PRO", 'P'},
+    {"SER", 'S'},
+    {"THR", 'T'},
+    {"TYR", 'Y'},
+    {"TRP", 'W'},
+    {"VAL", 'V'},
+    {"SEC", 'U'},
+    {"PYL", 'O'},
+    {"XPL", 'O'}, // for pdb 1L2Q
+    {"GLX", 'Z'}, // for pdb 1KP0
+    {"ASX", 'B'}  // for pdb 1KP0
+    // 3e2o, 2fmd, 2atc, 4cpa
+};
@@ -0,0 +1,42 @@
+#ifndef CONSTANTS_H
+#define CONSTANTS_H
+
+#include <cstddef>
+#include <string>
+#include <unordered_map>
+
+// X-macro list of all parsing outcomes: X(<enum identifier>, <printable name>).
+// Define X before expanding the list and #undef it afterwards.
+// The last entry deliberately has no trailing backslash, so the macro does not
+// silently continue onto whatever line follows it.
+#define PDB_PARSING_CODES \
+X(SUCCESS, "SUCCESS") \
+X(RESOLUTION_TOO_LOW, "RESOLUTION_TOO_LOW") \
+X(RESOLUTION_NOT_SPECIFIED, "RESOLUTION_NOT_SPECIFIED") \
+X(MISSING_NON_TERMINAL_RESIDUES, "MISSING_NON_TERMINAL_RESIDUES") \
+X(NO_ALPHA_CARBON_ATOMS_FOUND, "NO_ALPHA_CARBON_ATOMS_FOUND") \
+X(IS_NOT_PROTEIN, "IS_NOT_PROTEIN") \
+X(EXCLUDE_RARE_AMINO_ACIDS, "EXCLUDE_RARE_AMINO_ACIDS") \
+X(HAS_UNKNOWN_RESIDUE, "HAS_UNKNOWN_RESIDUE") \
+X(INVALID_SEQUENCE, "INVALID_SEQUENCE") \
+X(NO_UNIPROT_ID, "NO_UNIPROT_ID")
+
+// rare amino acids = SELENOCYSTEINE, PYRROLYSINE, others
+
+// One enumerator per list entry, numbered from 0; MAX_PDB_PARSING_CODES is the
+// number of entries.
+#define X(code, name) code,
+enum PDBParsingCode : size_t {
+    PDB_PARSING_CODES
+    MAX_PDB_PARSING_CODES
+};
+#undef X
+
+// Printable name for each PDBParsingCode, indexed by the enum value.
+extern const char *code_name[MAX_PDB_PARSING_CODES];
+
+enum ResidueConfirmation {
+    RESIDUE_VALID,
+    RESIDUE_DUPLICATE, // same residue multiple times (e.g. multiple conformations)
+    RESIDUE_OUT_OF_SEQUENCE // missing non-terminal residue
+};
+
+enum PDBType {PROTEIN, DNA, RNA, MISC};
+
+extern const float MAX_RESOLUTION;
+// Three-letter residue name -> one-letter code.
+extern const std::unordered_map<std::string, char> aminoAcidLookup;
+
+#endif // CONSTANTS_H
+
@@ -0,0 +1,136 @@
+#include "Utils.h"
+#include <sstream>
+#include <iterator>
+#include <unordered_set>
+#include <iostream>
+
+#include "Constants.h"
+
+// Joins `strings` with ',' in their given order; returns "" for an empty vector.
+std::string concatenateString(const std::vector<std::string>& strings) {
+    const char delim = ',';
+    std::ostringstream oss;
+
+    // The delimiter is written as a single character. Passing its address to
+    // std::ostream_iterator would be wrong: that expects a null-terminated string.
+    bool isFirst = true;
+    for (const std::string &element : strings) {
+        if (!isFirst) {
+            oss << delim;
+        }
+        oss << element;
+        isFirst = false;
+    }
+
+    return oss.str();
+}
+
+// Joins the elements of `strings` with ',' in the set's iteration order;
+// returns "" for an empty set.
+std::string concatenateString(const std::unordered_set<std::string>& strings) {
+    std::ostringstream joined;
+
+    // Nothing precedes the first element; every later one is preceded by ','.
+    bool needsSeparator = false;
+    for (const std::string &element : strings) {
+        if (needsSeparator) {
+            joined << ',';
+        }
+        joined << element;
+        needsSeparator = true;
+    }
+
+    return joined.str();
+}
+
+// Extracts the resolution from remark 2
+//
+// Reads the 7-character field starting at 0-based column 23 of `line`.
+// Returns the parsed value, -1 when the field is blank or the line ends before
+// it (empty remark line), and -2 when the field holds text that is not a number
+// (RESOLUTION. NOT APPLICABLE., e.g. pdb 134D).
+// A std::out_of_range thrown by std::stof is not caught here.
+float extractResolution(const std::string &line) {
+    const std::string::size_type fieldStart = 23;
+    const std::string::size_type fieldWidth = 7;
+
+    // Lines that are not padded to full width may stop before the field.
+    if (line.size() <= fieldStart) {
+        return -1;
+    }
+
+    const std::string res_section = line.substr(fieldStart, fieldWidth);
+    // skip empty remark line
+    if (res_section.find_first_not_of(' ') == std::string::npos) {
+        return -1;
+    }
+
+    try {
+        return std::stof(res_section);
+    } catch (const std::invalid_argument &) {
+        return -2; // RESOLUTION. NOT APPLICABLE. (e.g. pdb 134D)
+    }
+}
+
+/////////////////////
+/// PROCESS ENTRY ///
+/////////////////////
+
+// Prints the entry's id ("pdb_id:  <id>") and classifies it from the HEADER
+// classification text: DNA or RNA when that word occurs without the matching
+// "... BINDING PROTEIN" phrase, PROTEIN otherwise.
+// std::string::substr throws std::out_of_range for lines that are too short.
+PDBType processHeader(const std::string &line) {
+    const std::string classification = line.substr(10, 40); // 11-50
+    const std::string idCode = line.substr(62, 4); // 63-66
+
+    std::cout << "pdb_id:  " << idCode << std::endl;
+
+    // True when `text` occurs anywhere in the classification field.
+    const auto mentions = [&classification](const char *text) {
+        return classification.find(text) != std::string::npos;
+    };
+
+    if (mentions("DNA") && !mentions("DNA BINDING PROTEIN"))
+        return DNA;
+    if (mentions("RNA") && !mentions("RNA BINDING PROTEIN"))
+        return RNA;
+
+    return PROTEIN;
+}
+
+// Remark row
+//
+// Only REMARK 2 is handled: `resolution` is overwritten with the value returned
+// by extractResolution unless that value is -1 (blank remark line). Other
+// remark numbers leave `resolution` untouched.
+// std::stoi / substr throw when the remark number field (0-based columns 7-9)
+// is missing or not numeric.
+void processRemark(const std::string &line, float &resolution) {
+    const int remark_no = std::stoi(line.substr(7, 3));
+    if (remark_no != 2) {
+        return;
+    }
+
+    // Keep the value as float (no truncation to int) and parse the line once.
+    const float extractedRes = extractResolution(line);
+    if (extractedRes != -1) {
+        resolution = extractedRes;
+    }
+}
+
+// DBRef row
+//
+// Adds the accession found in columns 34-41 to `uniprotIds` when the database
+// field (columns 27-32) is "UNP   "; rows for other databases are ignored.
+void processDBRef(const std::string &line, std::unordered_set<std::string> &uniprotIds) {
+    const bool isUniprot = line.substr(26, 6) == "UNP   "; // 27 - 32
+    if (!isUniprot) // only match uniprot
+        return;
+
+    // Stream extraction keeps only the first whitespace-delimited token of the field.
+    std::string accession = line.substr(33, 8); // 34 - 41
+    std::stringstream tokenizer(accession);
+    tokenizer >> accession;
+
+    uniprotIds.insert(accession);
+}
+
+// DBREF1 row
+//
+// A UniProt DBREF1 row is continued by the next input line (DBREF2), which is
+// read from std::cin here; its field at columns 19-40 holds the accession that
+// is added to `uniprotIds`. Rows for other databases return without reading.
+// If the following line is missing, is not a DBREF2 row, is too short or has a
+// blank accession field, nothing is inserted (the line stays consumed).
+void processDBRef1(const std::string &line, std::unordered_set<std::string> &uniprotIds) {
+    // process 1 for uniprot
+    std::string db = line.substr(26, 6); // 27 - 32
+    if (db != "UNP   ") // only match uniprot
+        return;
+
+    // process 2 for id
+    std::string nextLine;
+    if (!std::getline(std::cin, nextLine)) // input ended right after DBREF1
+        return;
+    if (nextLine.compare(0, 6, "DBREF2") != 0 || nextLine.size() <= 18)
+        return;
+
+    std::string uniprotId;
+    std::stringstream parser(nextLine.substr(18, 22)); // 19 - 40
+    if (parser >> uniprotId) // false when the field contains only whitespace
+        uniprotIds.insert(uniprotId);
+}
+
+// SEQRES row
+//
+// Appends the one-letter code of every residue name listed on the row
+// (0-based columns 19-69) to the stream of the row's chain (character at
+// index 11). Residue names missing from aminoAcidLookup are written as '.'
+// and processing continues with the next residue on the row.
+// std::string::substr throws std::out_of_range for rows shorter than 19 characters.
+void processSequence(const std::string &line, std::unordered_map<char, std::stringstream> &sequenceStreams) {
+    const char chainId = line[11];
+    std::string aa;
+    std::stringstream aaStream(line.substr(19, 51));
+    while (aaStream >> aa) {
+        const auto known = aminoAcidLookup.find(aa);
+        if (known != aminoAcidLookup.end()) {
+            sequenceStreams[chainId] << known->second;
+        } else {
+            // replace non-standard AA with dot (.)
+            sequenceStreams[chainId] << '.';
+
+            // sequenceStreams.erase(chainId);
+            // throw std::out_of_range("test");
+        }
+    }
+}
@@ -0,0 +1,20 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+// Every standard type named in the declarations below is included here, so the
+// header does not depend on what other headers happen to pull in.
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "Constants.h"
+
+// Join the given strings with ','.
+std::string concatenateString(const std::vector<std::string>& strings);
+std::string concatenateString(const std::unordered_set<std::string>& strings);
+
+// Handlers for individual PDB record lines (HEADER, REMARK, DBREF, DBREF1, SEQRES).
+PDBType processHeader(const std::string &line);
+float extractResolution(const std::string &line);
+void processRemark(const std::string &line, float &resolution);
+void processDBRef(const std::string &line, std::unordered_set<std::string> &uniprotIds);
+void processDBRef1(const std::string &line, std::unordered_set<std::string> &uniprotIds);
+void processSequence(const std::string &line, std::unordered_map<char, std::stringstream> &sequenceStreams);
+
+#endif // UTILS_H
@@ -0,0 +1,284 @@
+#include <iostream>
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <unordered_set>
+#include <sstream>
+
+#include "AtomDataParser.h"
+#include "Constants.h"
+#include "Utils.h"
+
+
+// File-level parser state, shared by the functions below and cleared by
+// resetPDBOutput (except errorOutput).
+// bool invalidSequence = false;
+bool invalidAA{false};                // set by processAtom for residues coded U, O, Z or B
+bool hasUnknownResidues{false};       // set by processAtom when an UNK residue is seen
+std::stringstream parsedSequence;     // one-letter codes of the accepted CA atoms
+std::vector<std::string> errorOutput; // messages recorded by validateAtomSequence
+
+int firstCAResidue{0};                // residue number stored by validateAtomSequence for the first CA atom
+
+// Compares the residue number `resSeq` of a CA atom with the previous one.
+//
+// Returns RESIDUE_VALID when it directly follows the previous residue or is the
+// first one seen (previous position -1, which also records firstCAResidue),
+// RESIDUE_DUPLICATE when it repeats the previous number, and
+// RESIDUE_OUT_OF_SEQUENCE otherwise (a message is appended to errorOutput).
+// `prevCAResiduePosition` is updated to `resSeq` in every case except the duplicate.
+ResidueConfirmation validateAtomSequence(int &prevCAResiduePosition, const int &resSeq) {
+    const int previous = prevCAResiduePosition;
+
+    // Regular case: consecutive numbering.
+    if (previous + 1 == resSeq) {
+        prevCAResiduePosition = resSeq;
+        return RESIDUE_VALID;
+    }
+
+    // initial value
+    if (previous == -1) {
+        prevCAResiduePosition = resSeq;
+        firstCAResidue = resSeq;
+        return RESIDUE_VALID;
+    }
+
+    if (previous == resSeq)
+        return RESIDUE_DUPLICATE;
+
+    // Anything else is a jump in the numbering.
+    std::stringstream message;
+    message << "missing residues; prev=" << previous << ", next=" << resSeq;
+    errorOutput.push_back(message.str());
+
+    prevCAResiduePosition = resSeq;
+    return RESIDUE_OUT_OF_SEQUENCE;
+}
+
+//////////////////////////
+//////////////////////////
+///// PROCESS ROWS ///////
+//////////////////////////
+//////////////////////////
+
+// Handles one ATOM record.
+//
+// Records that parseAtomData does not flag as CA atoms, and residues repeating
+// the previous residue number, are ignored. For every other CA atom the
+// one-letter code is appended to parsedSequence and "<code> <x> <y> <z>" to
+// `output`. `isSequenceValid` is cleared when the residue numbering jumps.
+// UNK residues set hasUnknownResidues and are skipped; codes U, O, Z and B set
+// invalidAA. Any other unmapped residue name throws std::runtime_error.
+void processAtom(std::string &line, std::vector<std::string> &output, int &prevCAResiduePosition, bool &isSequenceValid) {
+    AtomData atom;
+    parseAtomData(line, atom, 0);
+    if (!atom.isValidAtom)
+        return;
+
+    const ResidueConfirmation placement = validateAtomSequence(prevCAResiduePosition, atom.resSeq);
+    if (placement == RESIDUE_DUPLICATE)
+        return; // skip to next
+    if (placement == RESIDUE_OUT_OF_SEQUENCE)
+        isSequenceValid = false;
+
+    // should always be found if pdb is valid & not unknown
+    const auto entry = aminoAcidLookup.find(atom.resName);
+    if (entry == aminoAcidLookup.end()) {
+        if (atom.resName == "UNK") {
+            hasUnknownResidues = true;
+            return;
+        }
+
+        throw std::runtime_error("Unexpected atom type: " + atom.resName);
+    }
+    const char aminoAcid = entry->second;
+
+    // Selenocysteine, Pyrrolysine, GLX, ASX, too rare, skip
+    switch (aminoAcid) {
+    case 'U':
+    case 'O':
+    case 'Z':
+    case 'B':
+        invalidAA = true;
+        break;
+    default:
+        break;
+    }
+
+    // construct output string
+    std::stringstream coordinates;
+    coordinates << aminoAcid << ' ' << atom.x << ' '  << atom.y << ' ' << atom.z;
+
+    // construct sequence string
+    parsedSequence << aminoAcid;
+
+    output.push_back(coordinates.str());
+}
+
+// Checks whether the parsed input, so far, produced a valid, sequential
+// list of residues with coordinates.
+// Returns SUCCESS if successful, and a specific error code otherwise.
+// Checks run in this order: resolution limit (MAX_RESOLUTION, exclusive),
+// resolution never set (-1), gaps in the residue numbering, no CA atom found.
+PDBParsingCode isPDBInvalid(float &resolution, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent) {
+    // Use the shared limit from Constants.h rather than a second literal.
+    const bool isResolutionValid = resolution < MAX_RESOLUTION;
+    if (!isResolutionValid) { // resolution too low
+        return RESOLUTION_TOO_LOW;
+    }
+
+    if (resolution == -1) // no valid resolution remark returned
+        return RESOLUTION_NOT_SPECIFIED;
+
+    if (!isSequenceValid) // missing non-terminal residues
+        return MISSING_NON_TERMINAL_RESIDUES;
+
+    if (prevCAResiduePosition == -1) { // no single CA atom found
+        if (anyCAAtomsPresent) // if any model had, but last one didn't
+            return MISSING_NON_TERMINAL_RESIDUES;
+        return NO_ALPHA_CARBON_ATOMS_FOUND;
+    }
+
+    return SUCCESS;
+}
+
+// Final validity check: the entry-level flags (non-protein entry, unknown
+// residues, rare amino acids) are tested first, in that order; if none is set
+// the result of the four-argument overload is returned.
+PDBParsingCode isPDBInvalid(float &resolution, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent, bool &isNotProtein) {
+    if (isNotProtein) {
+        return IS_NOT_PROTEIN;
+    // } else if (invalidSequence) {
+    //     return INVALID_SEQUENCE;
+    } else if (hasUnknownResidues) {
+        return HAS_UNKNOWN_RESIDUE;
+    } else if (invalidAA) {
+        return EXCLUDE_RARE_AMINO_ACIDS;
+    }
+
+    return isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent);
+}
+
+// Discards everything collected for the current chain so that the next one
+// starts from a clean state. `anyCAAtomsPresent` is sticky: once a chain has
+// produced output it stays true.
+void resetPDBOutput(std::vector<std::string> &output, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent) {
+    anyCAAtomsPresent = anyCAAtomsPresent || !output.empty();
+
+    // per-chain results passed in by the caller
+    output.clear();
+    isSequenceValid = true;
+    prevCAResiduePosition = -1;
+
+    // file-level parser state
+    parsedSequence.str("");
+    firstCAResidue = 0;
+    invalidAA = false;
+    hasUnknownResidues = false;
+}
+
+// Cleared by processSequences when no SEQRES sequence contains the parsed one.
+bool hasMatched = true;
+
+// Prints and returns the SEQRES sequences of the entry.
+//
+// A non-empty chain sequence that contains the (non-empty) sequence parsed from
+// the ATOM records becomes the "matched" sequence ("N/A" if there is none); all
+// remaining non-empty sequences are collected without duplicates as "other".
+// The returned vector holds the matched sequence first, then the others.
+// Returns an empty vector, printing nothing, when there are no SEQRES chains.
+std::vector<std::string> processSequences(std::unordered_map<char, std::stringstream> &sequenceStreams) {
+    std::vector<std::string> sequences;
+    if (sequenceStreams.empty())
+        return sequences;
+
+    const std::string parsed = parsedSequence.str();
+    std::string matchedSequence = "N/A";
+    std::unordered_set<std::string> uniqueSequences;
+
+    for (const auto &chain : sequenceStreams) {
+        const std::string sequence = chain.second.str();
+        if (sequence.empty())
+            continue;
+
+        const bool containsParsed = !parsed.empty() && sequence.find(parsed) != std::string::npos;
+        if (containsParsed)
+            matchedSequence = sequence;
+        else
+            uniqueSequences.insert(sequence);
+    }
+
+    // line 4: matched sequence (parsed contained within matched)
+    if (!matchedSequence.empty())
+        std::cout << "matched: " << matchedSequence << std::endl;
+    // line 5: sequence parsed from ATOM records
+    std::cout << "parsed:  " << parsed << std::endl;
+
+    sequences.push_back(matchedSequence);
+
+    // line 6+: all other parsed sequences
+    for (const std::string &other : uniqueSequences) {
+        std::cout << "other:   " << other << std::endl;
+        sequences.push_back(other);
+    }
+
+    if (matchedSequence == "N/A")
+        hasMatched = false;
+
+    return sequences;
+}
+
+// Small script for parsing PDB files.
+// Takes in a stream of a PDB file as input.
+//
+// Records are dispatched on their first six characters. At every TER record the
+// chain read so far is checked; parsing stops at the first chain that passes,
+// otherwise the collected data is reset and the next chain is tried.
+// Exit status: 0 on success, otherwise the numeric PDBParsingCode, whose name
+// is also written to standard error.
+int main() {
+    std::ios_base::sync_with_stdio(false);
+    std::cin.tie(nullptr);
+
+    std::vector<std::string> output;
+    std::unordered_set<std::string> uniprotIds;
+    bool isSequenceValid = true;
+    bool anyCAAtomsPresent = false;
+    bool isNotProtein = false;
+    auto resolution = -1.0f;
+    int prevCAResiduePosition = -1;
+
+    std::unordered_map<char, std::stringstream> sequenceStreams;
+
+    std::string line;
+    while (std::getline(std::cin, line)) {
+        const std::string param = line.substr(0, 6);
+
+        if (param == "HEADER") {
+            auto headerType = processHeader(line);
+            if (headerType != PROTEIN) {
+                isNotProtein = true;
+                // break;
+            }
+        } else if (param == "REMARK") {
+            processRemark(line, resolution);
+        } else if (param == "DBREF ") {
+            processDBRef(line, uniprotIds);
+        } else if (param == "DBREF1") {
+            processDBRef1(line, uniprotIds);
+        } else if (param == "SEQRES") {
+            processSequence(line, sequenceStreams);
+        } else if (param == "ATOM  ") { // HETATM residues are skipped
+            processAtom(line, output, prevCAResiduePosition, isSequenceValid);
+        } else if (param == "TER   ") { // end of one chain
+            auto pdbValidity = isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent);
+            // std::cout << code_name[pdbValidity] << std::endl;
+            if (pdbValidity == SUCCESS)
+                break; // terminate parser, output PDB
+
+            // if at first you don't succeed, try, try again (parse next model)
+            resetPDBOutput(output, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent);
+        } // else ignore line, until end is reached
+    }
+
+    // line 1 -- pdb id
+    // "pdb_id:  201L" (printed in Utils.cpp)
+
+    // line 2 -- resolution
+    std::cout << "resolut: " << resolution << std::endl;
+
+    // line 3 -- uniprot IDs
+    std::string allUniprotIds = concatenateString(uniprotIds);
+    std::cout << "uniprot: " << allUniprotIds << std::endl;
+
+    // line 4 -- matched sequence (atom record substring of SEQRES)
+    // line 5 -- parsed sequence (atom records)
+    // line 6-n -- other sequences (SEQRES sequence)
+    // Called for the lines it prints; the returned list is not needed here.
+    processSequences(sequenceStreams);
+
+
+    auto pdbValidity = isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent, isNotProtein);
+    if (pdbValidity != SUCCESS) {
+        std::cerr << code_name[pdbValidity] << std::endl;
+
+        for (const std::string &error : errorOutput)
+            std::cout << error << std::endl;
+        return pdbValidity;
+    }
+
+    if (uniprotIds.size() == 0) {
+        std::cerr << code_name[NO_UNIPROT_ID] << std::endl;
+        return NO_UNIPROT_ID;
+    }
+
+    // if (!hasMatched) {
+    //     std::cerr << "NO_MATCH" << std::endl;
+    //     return 103;
+    //     // std::cerr << "NO MATCHED SEQUENCE" << std::endl;
+    // }
+
+    // line n+1: sequence number of initial residue (starts with 1)
+    std::cout << "initres: " << firstCAResidue << std::endl;
+
+    // line n+2: empty line
+    std::cout << std::endl;
+
+    // lines n+3 to end: coordinates in format <residue> <x> <y> <z>
+    for (const std::string &pos : output) {
+        std::cout << pos << std::endl;
+    }
+
+    return 0;
+}
+
@@ -0,0 +1,114 @@
+pdb_id:  2SPC
+resolut: 1.8
+uniprot: P13395
+matched: QNLDLQLYMRDCELAESWMSAREAFLNADDDANAGGNVEALIKKHEDFDKAINGHEQKIAALQTVADQLIAQNHYASNLVDEKRKQVLERWRHLKEGLIEKRSRLGD
+parsed:  QNLDLQLYMRDCELAESWMSAREAFLNADDDANAGGNVEALIKKHEDFDKAINGHEQKIAALQTVADQLIAQNHYASNLVDEKRKQVLERWRHLKEGLIEKRSRLGD
+initres: 0
+
+Q 9.907 11.72 54.535
+N 12.788 14.14 53.949
+L 11.802 16.244 50.883
+D 15.003 15.15 49.161
+L 14.068 11.571 49.452
+Q 10.65 12.624 48.133
+L 12.115 14.246 45.02
+Y 14.073 11.064 44.305
+M 11.012 8.87 44.708
+R 9.095 11.152 42.291
+D 11.987 11.014 39.843
+C 12.022 7.185 40.088
+E 8.282 7.058 39.496
+L 8.617 9.386 36.438
+A 11.462 7.139 35.063
+E 9.297 4.066 35.531
+S 6.386 5.701 33.763
+W 8.623 6.706 30.78
+M 9.802 3.061 30.69
+S 6.172 1.93 30.638
+A 5.384 4.178 27.708
+R 8.346 2.567 25.825
+E 7.01 -0.902 26.678
+A 3.663 0.075 25.043
+F 5.556 1.418 21.96
+L 7.357 -1.841 21.506
+N 4.03 -3.527 21.436
+A 2.384 -1.339 18.87
+D 5.547 -1.733 16.805
+D 5.224 -5.496 16.92
+D 1.71 -5.1 15.512
+A 3.188 -3.413 12.374
+N 6.848 -2.398 12.824
+A 9.052 -0.791 10.222
+G 9.794 -2.047 6.675
+G 12.498 -4.177 4.969
+N 13.967 -1.043 3.216
+V 17.68 -0.537 3.979
+E 17.636 3.194 4.641
+A 14.899 2.888 7.229
+L 15.787 -0.448 8.811
+I 18.88 1.405 9.657
+K 17.038 4.571 10.711
+K 14.888 2.479 13.062
+H 17.982 0.769 14.366
+E 19.512 4.11 15.342
+D 16.265 5.007 17.029
+F 16.484 1.922 19.286
+D 20.138 2.691 20.086
+K 19.077 6.24 20.997
+A 16.243 4.876 23.217
+I 18.463 2.373 24.949
+N 21.093 4.995 25.504
+G 18.802 7.603 27.025
+H 17.291 4.974 29.279
+E 20.699 3.842 30.514
+Q 21.74 7.36 31.254
+K 18.626 8.228 33.163
+I 19.07 4.99 35.165
+A 22.673 5.942 36.042
+A 21.712 9.362 37.233
+L 19.069 7.939 39.585
+Q 21.583 5.512 40.954
+T 23.72 8.471 41.69
+V 21.053 10.121 43.722
+A 20.244 6.848 45.58
+D 23.921 6.549 46.53
+Q 23.872 10.049 47.957
+L 20.8 9.651 50.047
+I 22.001 6.333 51.437
+A 25.492 7.822 52.194
+Q 23.74 10.702 54.081
+N 21.676 8.217 56.08
+H 24.768 6.134 56.84
+Y 26.644 9.203 57.99
+A 23.826 10.239 60.337
+S 23.465 6.734 61.816
+N 27.196 6.474 62.335
+L 27.336 9.819 64.171
+V 24.369 8.807 66.346
+D 25.834 5.436 67.258
+E 29.307 6.694 67.839
+K 28.077 9.514 70.137
+R 26.142 6.804 72.015
+K 29.251 4.721 72.476
+Q 31.204 7.767 73.797
+V 28.409 8.608 76.23
+L 28.205 4.949 77.433
+E 31.978 4.872 77.948
+R 31.98 8.098 79.926
+W 29.249 6.529 82.085
+R 31.385 3.523 82.57
+H 34.024 5.735 84.157
+L 31.539 7.755 86.235
+K 29.984 4.543 87.496
+E 33.353 3.27 88.454
+G 33.954 6.165 90.738
+L 30.502 6.126 92.265
+I 30.703 2.45 93.297
+E 34.323 2.559 94.435
+K 33.83 5.738 96.532
+R 30.789 3.887 97.846
+S 33.238 1.146 98.961
+R 35.696 3.644 100.393
+L 33.011 4.864 102.758
+G 34.731 2.184 104.901
+D 35.966 -1.307 103.936
@@ -0,0 +1,169 @@
+>sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
+MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
+EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD
+AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL
+EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD
+SFRKIYTDLGWKFTPL
+>sp|Q6GZX3|002L_FRG3G Uncharacterized protein 002L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-002L PE=4 SV=1
+MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
+IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
+AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
+KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
+DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
+VMFFVAGAVLVAILISTVRW
+>sp|Q197F8|002R_IIV3 Uncharacterized protein 002R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-002R PE=4 SV=1
+MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL
+QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT
+FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD
+LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET
+YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY
+STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS
+GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI
+QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC
+>sp|Q197F7|003L_IIV3 Uncharacterized protein 003L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-003L PE=4 SV=1
+MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT
+PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS
+TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI
+>sp|Q6GZX2|003R_FRG3G Uncharacterized protein 3R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-003R PE=3 SV=1
+MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD
+RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI
+FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ
+PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD
+AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR
+TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA
+LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR
+KAKIQEMFDNMVSRMVTS
+>sp|Q6GZX1|004R_FRG3G Uncharacterized protein 004R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-004R PE=4 SV=1
+MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY
+>sp|Q197F5|005L_IIV3 Uncharacterized protein 005L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-005L PE=3 SV=1
+MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTL
+CSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELC
+KADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYH
+QPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY
+>sp|Q6GZX0|005R_FRG3G Uncharacterized protein 005R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-005R PE=4 SV=1
+MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS
+NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED
+QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT
+REFVDKDAQEFQDFLNSLDASLLS
+>sp|Q91G88|006L_IIV6 Putative KilA-N domain-containing protein 006L OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-006L PE=3 SV=1
+MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL
+IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII
+INYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRK
+KDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVE
+DRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCH
+PNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV
+>sp|Q6GZW9|006R_FRG3G Uncharacterized protein 006R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-006R PE=4 SV=1
+MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIE
+YLGPWVQAEYRRIKG
+>sp|Q6GZW8|007R_FRG3G Uncharacterized protein 007R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-007R PE=4 SV=1
+MRSIKPLRCCNAHGRHVSQEYGRCTLLLFREKLFLQTGLVCNKQCNAPNNDGAESKHHGI
+HHGSRGALALRGAGVHLLASAALGPRVLAGLVPTGRSVQGSVGQCGRVAQIGRARDVAAR
+KQESYCEK
+>sp|Q197F3|007R_IIV3 Uncharacterized protein 007R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-007R PE=4 SV=1
+MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV
+YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL
+LVGNEWYCKTFGKAGSKNVFLYNMIPTIYRDEPQHQEQILKKFMFFNATKNVEQNPNFLD
+NVPEEYYHLLLPKSWVEKNLSDKYRKIMETEHKPLVFSCEPAFSFGLCRNTQDKNESYQL
+SLCLYEREKPRDAEIVWAAKYDELAAMVRDYLKKTPEFKKYRSFISCMKGLSWKNNEIGD
+KDGPKLYPKVIFNRKKGEFVTIFTKDDDVEPETIEDPRTILDRRCVVQAALRLESVFVHN
+KVAIQLRINDVLISEWKEASSKPQPLILRRHRFTKPSSSVAKSTSPSLRNSGSDESDLNQ
+SDSDKEDERVVPVPKTKRIVKTVKLPN
+>sp|Q197F2|008L_IIV3 Uncharacterized protein 008L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-008L PE=4 SV=1
+MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQNAGDVTNKAYVDQA
+VMSAAVPVASSTTVGTIQMAGDLEGSSGTNPIIAANKITLNKLQKIGPKMVIGNPNSDWN
+NTQEIELDSSFRIVDNRLNAGIVPISSTDPNKSNTVIPAPQQNGLFYLDSSGRVWVWAEH
+YYKCITPSRYISKWMGVGDFQELTVGQSVMWDSGRPSIETVSTQGLEVEWISSTNFTLSS
+LYLIPIVVKVTICIPLLGQPDQMAKFVLYSVSSAQQPRTGIVLTTDSSRSSAPIVSEYIT
+VNWFEPKSYSVQLKEVNSDSGTTVTICSDKWLANPFLDCWITIEEVG
+>sp|Q6GZW6|009L_FRG3G Putative helicase 009L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-009L PE=4 SV=1
+MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRRAVQDDPAFGHQKLV
+ETFLSEDTPYRELLLFHAPGTGKTCTVVSVAERAKEKGLTRGCIVLARGAALLRNFLHEL
+VFNCGTGGRYIPEGYADMGDQERTRKMRKAVSSYYQFRTYETFAKSVATMSAEAIRARYD
+RFVIVMDEVHHLRSVQAEGVNTYSAISRFLRTVRGCVKMLLTGTPMTNEPGELADVLNLI
+LPQDKTIRPEDGIFSNSGDLLKPDELAERVRGRVSYLKAARPDAGLTFAGEVLGGTGMTH
+LRLVRLEMSAFQSDAYASAWDQDAGDRNIFSNSRQCSLAVMPDRRWGSAAEARNPSQVRR
+MAGQNLAEYSVKYDYLVRVASSSPKTFAYCEYVNGSGLSLLSDILLANGWRRATGRETTP
+GKRFALLTASQKNIHKIVQRFNHEDNVDGAYISLLLGSRVVAEGLTFKEVRHTVILTPHW
+NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPRSIDSDMYAVSEVKD
+KRIKAVERILMTSAADCSLLRSRNLYPSEFDGSRECEYGRCAYRCSNVSVEPGPLPALLG
+ASAAEAVAQVRLDGGGDPAIMKVDMSTLWAEVTAGRRYVNRWGDGAVLRAEGGRLELSAP
+YGSSEEGRWGDFYKTRNLCYAKMDQDHLRADDLRDSLPQEVEELLTVSPVETIGETASAM
+PQEVATAILMACVQARADGKTLNVVRRDALLDFYKGFYAMGPSGWTVWLHARGANAKVYD
+GRRWNPADEDTLEFLAARSAKFTDTRIGYYGLYNPNLKDFCIRDVTQGKRDKVDLRKLTV
+GRRCVDWDQRTLVHIVARLMKIDGRRDFMPHATLREMRELAEQDPLHEPSDLTSKEACRR
+FLFWTQKGDNKFRRQDICKAMEKWFIENDLMEDNFDCGHQHKRRGKFA
+>sp|Q91G85|009R_IIV6 Uncharacterized protein 009R OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-009R PE=3 SV=1
+MIKLFCVLAAFISINSACQSSHQQREEFTVATYHSSSICTTYCYSNCVVASQHKGLNVES
+YTCDKPDPYGRETVCKCTLIKCHDI
+>sp|Q6GZW5|010R_FRG3G Uncharacterized protein 010R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-010R PE=4 SV=1
+MKMDTDCRHWIVLASVPVLTVLAFKGEGALALAGLLVMAAVAMYRDRTEKKYSAARAPSP
+IAGHKTAYVTDPSAFAAGTVPVYPAPSNMGSDRFEGWVGGVLTGVGSSHLDHRKFAERQL
+VDRREKMVGYGWTKSFF
+>sp|Q197E9|011L_IIV3 Uncharacterized protein 011L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-011L PE=4 SV=1
+MMESPKYKKSTCSVTNLGGTCILPQKGATAPKAKDVSPELLVNKMDNLCQDWARTRNEYN
+KVHIEQAPTDSYFGVVHSHTPKKKYTSRDSDSEPEATSTRRSATAQRAANLKSSPVDQWS
+TTPPQPQPQPAAPTVKKTCASSPPAALSVKRTCTSPPPPPVLIDDDTGEDAFYDTNDPDI
+FYDIENGVSELETEGPKRPVYYQRNIRYPIDGSVPQESEQWYDPIDDEFLASSGDVVSLE
+PSPIAAFQPTPPKTVQFVPMPEEIIVPPPPPPKTVVDEGVQAMPYTVDQMIQTDFEESPL
+LANVNLRTIPIEEVNPNFSPVLMQDMVRDSFVFGTVAQRVMASQRVKQFFKELIEQDVSL
+AGRMCMDSGSPQLNLYNSLMGVKLLYRWRSSTTFYRAIVPEIDEPVQVMQDVLSSSEWAK
+FDSQAGIPPKMVYIHYKLLNDLVKTLICPNFQLTHAALVCVDCRPEAVGSDGLQDGRQRR
+CSNLVSEYHEMTLEDLFNTIKPADLNAKNIILSVLFQMLYAVATVQKQFGMGGLFANADS
+VHVRRIQPGGFWHYTVNGLRYSVPNYGYLVILTNFTDVVNYRPDFATTRYFGRRQAKVVP
+TRNWYKFVPFTTRYRPFVTVDPITQAKTTAYAPNPPTEGITINEFYKDSSDLRPSVPVDL
+NDMITFPVPEFHLTICRLFSFFSKFYDSNFIGNDPFVRNLVDRYSQPFEFPDVYWPEDGV
+SRVLACYTIEEIYPNWVDGDTDYVIESYNLD
+>sp|Q6GZW4|011R_FRG3G Uncharacterized protein 011R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-011R PE=4 SV=1
+MTSVKTIAMLAMLVIVAALIYMGYRTFTSMQSKLNELESRVNAPQLRPPVMSPIVPLNFI
+ESEDLDKELD
+>sp|Q6GZW3|012L_FRG3G Uncharacterized protein 012L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-012L PE=4 SV=1
+MCAKLVEMAFGPVNADSPPLTAEEKESAVEKLVGSKPFPALKKKYHDKVPAQDPKYCLFS
+FVEVLPSCDIKAAGAEEMCSCCIKRRRGQVFGVACVRGTAHTLAKAKQKADKLVGDYDSV
+HVVQTCHVGRPFPLVSSGMAQETVAPSAMEAAEAAMDAKSAEKRKERMRQKLEMRKREQE
+IKARNRKLLEDPSCDPDAEEETDLERYATLRVKTTCLLENAKNASAQIKEYLASMRKSAE
+AVVAMEAADPTLVENYPGLIRDSRAKMGVSKQDTEAFLKMSSFDCLTAASELETMGF
+>sp|Q197E7|013L_IIV3 Uncharacterized protein IIV3-013L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-013L PE=4 SV=1
+MYYRDQYGNVKYAPEGMGPHHAASSSHHSAQHHHMTKENFSMDDVHSWFEKYKMWFLYAL
+ILALIFGVFMWWSKYNHDKKRSLNTASIFY
+>sp|Q6GZW2|013R_FRG3G Uncharacterized protein 013R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-013R PE=4 SV=1
+MANSVAFSSMTWYSPLASDNLYDICVDKVHNRVLCLCHSFGCCTNAVVIWILPSFDEFTP
+QTLSCKGP
+>sp|Q6GZW1|014R_FRG3G Uncharacterized protein 014R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-014R PE=4 SV=1
+METLVQAYLDIQGKIAEFRREIKALRVEEKAITANLFEAMGEAGVESIRISEDRYLVAEE
+KPKRTRSKQQFYQAAEGEGFTQEDVDRLMSLSRGAVTGSSSNVKIRKSAPARNEEDDDG
+>sp|Q6GZW0|015R_FRG3G Uncharacterized protein 015R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-015R PE=4 SV=1
+MEQVPIKEMRLSDLRPNNKSIDTDLGGTKLVVIGKPGSGKSTLIKALLDSKRHIIPCAVV
+ISGSEEANGFYKGVVPDLFIYHQFSPSIIDRIHRRQVKAKAEMGSKKSWLLVVIDDCMDN
+AKMFNDKEVRALFKNGRHWNVLVVIANQYVMDLTPDLRSSVDGVFLFRENNVTYRDKTYA
+NFASVVPKKLYPTVMETVCQNYRCMFIDNTKATDNWHDSVFWYKAPYSKSAVAPFGARSY
+WKYACSKTGEEMPAVFDNVKILGDLLLKELPEAGEALVTYGGKDGPSDNEDGPSDDEDGP
+SDDEEGLSKDGVSEYYQSDLDD
+>sp|Q6GZV8|017L_FRG3G Uncharacterized protein 017L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-017L PE=4 SV=1
+METMSDYSKEVSEALSALRGELSALSAAISNTVRAGSYSAPVAKDCKAGHCDSKAVLKSL
+SRSARDLDSAVEAVSSNCEWASSGYGKQIARALRDDAVRVKREVESTRDAVDVVTPSCCV
+QGLAEEAGKLSEMAAVYRCMATVFETADSHGVREMLAKVDGLKQTMSGFKRLLGKTAEID
+GLSDSVIRLGRSIGEVLPATEGKAMRDLVKQCERLNGLVVDGSRKVEEQCSKLRDMASQS
+YVVADLASQYDVLGGKAQEALSASDALEQAAAVALRAKAAADAVAKSLDSLDVKKLDRLL
+EQASAVSGLLAKKNDLDAVVTSLAGLEALVAKKDELYKICAAVNSVDKSKLELLNVKPDR
+LKSLTEQTVVVSQMTTALATFNEDKLDSVLGKYMQMHRFLGMATQLKLMSDSLAEFQPAK
+MAQMAAAASQLKDFLTDQTVSRLEKVSAAVDATDVTKYASAFSDGGMVSDMTKAYETVKA
+FAAVVNSLDSKKLKLVAECAKK
+>sp|Q6GZV7|018L_FRG3G Uncharacterized protein 018L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-018L PE=3 SV=1
+MQNSKTDMCAALWAVTGLVLNVAVRFALEPFKESMGQGWHTAARVAVNGAIVLALADRLS
+DSPVTMTLFVMALSASPE
+>sp|Q6GZV6|019R_FRG3G Putative serine/threonine-protein kinase 019R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-019R PE=3 SV=1
+MATNYCDEFERNPTRNPRTGRTIKRGGPVFRALERECSDGAARVFPAAAVRGAAAARAAS
+PRVAAASPCPEFARDPTRNPRTGRPIKRGGPVFRALERECADYGGASPRRVSPARAFPNR
+RVSPARRQSPAEAAEASPCPEFARDPTRNPRTGRTIKRGGPTYRALEAECADYGRLSPIR
+SPWSDWSSTGLSPFRSHMRKSPARRSPARRSPARRSLARYTEHLTSDSETEVDYDARNVI
+RSQVGPGGVCERFAADPTRNPVTGSPLSRNDPLYTDLMEICKGYPDTPLTKSLTGEGTDD
+DTCEAFCRDPTRNPVTGQKMRRNGIEYQMFAEECDCSGISRPSGVSRTSGTSGSSGSSAS
+SRPPNSFEAPGASSRPPNSFEASGAARVPGTPSVSRGEPRWMSSISTRHNYDESNPMSVA
+FRLRHVKDIRKFLRTVRPGRSGFCATDKGGWLGSAAVSDNVIGQGSWGSVHMVKFRDFPE
+EFVVKEAVLMSVSEKHRYKPTVVWDEWAAGSVPDEVVVNNMVTEIAATGMTPFVPLTAGA
+GACDSCNPQLLEKAAKVTKCYLQAMEAADFSLDRVLPTMSPDQAASALAQILLGLQSLQT
+TLGIMHNDIKAHNILVKRVPPGGYWKVTDSFNGQVFYIPNEGYLCMLADYGVVRLVKPAV
+GMDTLYGTRNARFVPRDVGRWGKGAGTEYVVTPIRSKISVVVRGGRFVGVEPNKAVRYWK
+NTDTSKVGDVITTNNVFYMGYDIEPDMQVQLDDTNSFPVWESRGDVADCVRTFVGGKRAS
+QPGFHRLFYKKTGSAWEKAAETVAKQNPLFSGFTLDGSGLKYIRAATACAYIFPGMAVPR
+PGEREIESFTM
@@ -0,0 +1,70 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <sqlite3.h>
+
+#define CHUNK_SIZE 10000
+
+// One FASTA entry queued for insertion into the `sequences` table.
+struct Record {
+    std::string id;        // text between the first two '|' of the '>' header line
+    std::string sequence;  // concatenation of the lines following that header
+};
+
+// Inserts every queued record inside one transaction, then empties `records`.
+//
+// Values are bound to a prepared statement rather than spliced into the SQL text,
+// so an id or sequence containing a quote can neither break the statement nor
+// inject SQL. Failures are reported on stderr; a failed row is skipped and the
+// remaining rows are still attempted (best effort, as before).
+void chunked_insert(sqlite3* db, std::vector<Record>& records) {
+    char* zErrMsg = nullptr;
+    if(sqlite3_exec(db, "BEGIN TRANSACTION;", nullptr, nullptr, &zErrMsg) != SQLITE_OK) {
+        std::cerr << "BEGIN TRANSACTION failed: " << (zErrMsg ? zErrMsg : "unknown error") << '\n';
+        sqlite3_free(zErrMsg);  // sqlite3_exec allocates the message; the caller owns it
+        zErrMsg = nullptr;
+    }
+
+    sqlite3_stmt* stmt = nullptr;
+    const char* insert_sql = "INSERT INTO sequences (id, sequence) VALUES (?1, ?2);";
+    if(sqlite3_prepare_v2(db, insert_sql, -1, &stmt, nullptr) == SQLITE_OK) {
+        for(const auto& record : records) {
+            // SQLITE_STATIC: the strings outlive the step/reset below, so no copy is needed.
+            sqlite3_bind_text(stmt, 1, record.id.data(), static_cast<int>(record.id.size()), SQLITE_STATIC);
+            sqlite3_bind_text(stmt, 2, record.sequence.data(), static_cast<int>(record.sequence.size()), SQLITE_STATIC);
+
+            if(sqlite3_step(stmt) != SQLITE_DONE) {
+                std::cerr << "Insert failed for id '" << record.id << "': " << sqlite3_errmsg(db) << '\n';
+            }
+
+            sqlite3_reset(stmt);
+            sqlite3_clear_bindings(stmt);
+        }
+    } else {
+        std::cerr << "Could not prepare insert statement: " << sqlite3_errmsg(db) << '\n';
+    }
+    sqlite3_finalize(stmt);  // harmless no-op when stmt is nullptr
+
+    if(sqlite3_exec(db, "COMMIT;", nullptr, nullptr, &zErrMsg) != SQLITE_OK) {
+        std::cerr << "COMMIT failed: " << (zErrMsg ? zErrMsg : "unknown error") << '\n';
+        sqlite3_free(zErrMsg);
+    }
+    records.clear();
+}
+
+// Reads FASTA text from stdin and stores one row per entry in the `sequences`
+// table of uniprotkb/uniprot_sequences.db. The id is the text between the first
+// two '|' of each '>' header line; the sequence is the concatenation of the lines
+// that follow it. Rows are written in batches of CHUNK_SIZE.
+//
+// Returns 0 on success, 1 when the database cannot be opened or the table cannot
+// be created.
+int main() {
+    std::ios_base::sync_with_stdio(false);
+    std::cin.tie(nullptr);
+
+    sqlite3* db = nullptr;
+    if(sqlite3_open("uniprotkb/uniprot_sequences.db", &db) != SQLITE_OK) {
+        std::cerr << "Cannot open database: " << sqlite3_errmsg(db) << '\n';
+        sqlite3_close(db);  // a handle may be returned even on failure and must be released
+        return 1;
+    }
+
+    char* zErrMsg = nullptr;
+    if(sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS sequences (id TEXT PRIMARY KEY, sequence TEXT NOT NULL);", nullptr, nullptr, &zErrMsg) != SQLITE_OK) {
+        std::cerr << "Cannot create table: " << (zErrMsg ? zErrMsg : "unknown error") << '\n';
+        sqlite3_free(zErrMsg);
+        sqlite3_close(db);
+        return 1;
+    }
+    // sqlite3_exec(db, "CREATE INDEX IF NOT EXISTS idx_id ON sequences (id);", NULL, NULL, &zErrMsg);
+    // sqlite3_exec(db, "CREATE INDEX IF NOT EXISTS idx_sequence ON sequences (sequence);", NULL, NULL, &zErrMsg);
+
+    std::vector<Record> records;
+    std::string line, uniprot_id, sequence;
+    while(std::getline(std::cin, line)) {
+        // Tolerate CRLF input: otherwise '\r' would end up inside the stored sequence.
+        if(!line.empty() && line.back() == '\r') {
+            line.pop_back();
+        }
+
+        if(!line.empty() && line.front() == '>') {
+            // A new header closes the previous entry.
+            if(!uniprot_id.empty() && !sequence.empty()) {
+                records.push_back({uniprot_id, sequence});
+
+                if(records.size() >= CHUNK_SIZE){
+                    chunked_insert(db, records);
+                }
+            }
+            // Always reset, so text belonging to a skipped entry cannot leak into the next one.
+            sequence.clear();
+
+            // Extracting Uniprot ID
+            const std::string::size_type first_pipe = line.find('|');
+            const std::string::size_type second_pipe =
+                first_pipe == std::string::npos ? std::string::npos : line.find('|', first_pipe + 1);
+            if(second_pipe == std::string::npos) {
+                std::cerr << "Skipping entry with malformed header: " << line << '\n';
+                uniprot_id.clear();
+            } else {
+                uniprot_id = line.substr(first_pipe + 1, second_pipe - first_pipe - 1);
+            }
+        } else if(!uniprot_id.empty()) {
+            // Lines before the first valid header (or after a malformed one) are ignored.
+            sequence += line;
+        }
+    }
+
+    if(!uniprot_id.empty() && !sequence.empty()){
+        records.push_back({uniprot_id, sequence});
+    }
+
+    if(!records.empty()){
+        chunked_insert(db, records);
+    }
+
+    sqlite3_close(db);
+    return 0;
+}
@@ -0,0 +1,137 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <sstream>
+#include <unordered_map>
+#include <filesystem>
+#include <vector>
+#include <algorithm>
+#include <map>
+#include <iomanip>
+
+// One candidate structure read from a line of a per-protein .info file.
+// The numeric members are zero-initialised so that a line which fails to parse
+// never exposes indeterminate values to the sorting/selection code.
+struct PdbInfo {
+    std::string pdb_id;           // first whitespace-separated token of the line
+    double resolution = 0.0;      // second token
+    int sequence_length = 0;      // third token
+};
+
+namespace fs = std::filesystem;
+
+// Occurrence count of every k-mer prefix seen so far; filled by parseKmersFile.
+std::unordered_map<std::string, int> global_kmers;
+// Set by the -a flag: read every file in the pdbs directory instead of one selected PDB per .info file.
+bool process_all_pdbs = false;
+// Number of leading characters kept from each k-mer line; overridable with -k.
+int kmer_size = 12;
+
+// Returns the path (as a string) of every entry directly inside `uniprot_path`,
+// in the order the directory iterator yields them.
+std::vector<std::string> readUniprotFiles(const fs::path& uniprot_path) {
+    std::vector<std::string> paths;
+    const fs::directory_iterator last;
+    for(fs::directory_iterator it(uniprot_path); it != last; ++it) {
+        paths.push_back(it->path().string());
+    }
+    return paths;
+}
+
+// Parses "<pdb_id> <resolution> <sequence_length> ..." into a PdbInfo.
+// Any trailing tokens are ignored. Fields that cannot be extracted keep their
+// zero/empty value instead of being left indeterminate.
+PdbInfo parseLine(const std::string& line) {
+    std::istringstream iss(line);
+    PdbInfo info{};  // value-initialise: once the stream fails, later extractions leave their target untouched
+    iss >> info.pdb_id >> info.resolution >> info.sequence_length;
+    return info;
+}
+
+// Picks one entry from `pdb_infos`.
+//
+// Candidates are ordered by ascending resolution value, ties broken by longer
+// sequence first. Walking that order, the first candidate whose sequence length
+// is at least 80% of the next candidate's length is returned; if none qualifies,
+// the last candidate is returned.
+//
+// An empty input yields a default PdbInfo (empty pdb_id) instead of reading out
+// of bounds.
+PdbInfo selectPdb(const std::vector<PdbInfo>& pdb_infos) {
+    if (pdb_infos.empty()) {
+        return PdbInfo{};
+    }
+
+    std::vector<PdbInfo> sorted_infos = pdb_infos;
+    std::sort(sorted_infos.begin(), sorted_infos.end(), [](const PdbInfo& a, const PdbInfo& b) {
+        return a.resolution < b.resolution || (a.resolution == b.resolution && a.sequence_length > b.sequence_length);
+    });
+
+    // `i + 1 < size()` cannot underflow the way `i < size() - 1` does.
+    for (size_t i = 0; i + 1 < sorted_infos.size(); ++i) {
+        if (static_cast<double>(sorted_infos[i].sequence_length) >= 0.8 * sorted_infos[i + 1].sequence_length) {
+            return sorted_infos[i];
+        }
+    }
+    return sorted_infos.back();
+}
+
+// Reads a per-protein .info file and returns the entry chosen by selectPdb.
+//
+// Blank lines and the leading ">uniprot_id" header line are skipped: they are not
+// PDB entries, and a header parsed as an entry gets resolution 0 and sorts ahead
+// of every real candidate.
+PdbInfo parseInfoFile(const std::string& file_path) {
+    std::ifstream file(file_path);
+    std::string line;
+    std::vector<PdbInfo> pdb_infos;
+    while(std::getline(file, line)) {
+        if(line.empty() || line.front() == '>') {
+            continue;
+        }
+        pdb_infos.push_back(parseLine(line));
+    }
+
+    return selectPdb(pdb_infos);
+}
+
+// Counts, in global_kmers, the first `kmer_size` characters of every line of
+// `file_path` that is at least that long; shorter lines are ignored.
+void parseKmersFile(const std::string& file_path) {
+    std::ifstream input(file_path);
+    const std::size_t width = kmer_size;  // same implicit int -> size_t conversion the comparison performs
+    for(std::string row; std::getline(input, row); ) {
+        if(row.length() < width) {
+            continue;
+        }
+        ++global_kmers[row.substr(0, width)];
+    }
+}
+
+// Aggregates k-mer frequencies and prints "<kmer> <count>" lines to stdout,
+// most frequent first (ties ordered alphabetically so the output is reproducible).
+//
+// Options:
+//   -a          read every file in ./pdb_output/pdbs
+//   -k <value>  number of leading characters kept per k-mer (positive integer)
+//   -h, --help  print usage and exit
+//
+// Without -a, one PDB is selected per file in ./pdb_output/uniprot and only its
+// .kmers file is read. Progress is reported on stderr.
+// Returns 1 on an invalid -k value or a missing input directory.
+int main(int argc, char** argv) {
+    for(int i = 1; i < argc; i++) {
+        const std::string arg = argv[i];
+        if(arg == "-a") {
+            process_all_pdbs = true;
+        } else if(arg == "-k" && i + 1 < argc) {
+            // Parse without exceptions and reject zero/negative sizes up front.
+            std::istringstream value(argv[++i]);
+            int parsed = 0;
+            if(!(value >> parsed) || parsed <= 0) {
+                std::cerr << "Invalid k-mer size: " << argv[i] << '\n';
+                return 1;
+            }
+            kmer_size = parsed;
+        } else if(arg == "-h" || arg == "--help") {
+            std::cout << "Usage: " << argv[0] << " [OPTIONS]\n"
+                      << "Options:\n"
+                      << "  -a            Process all PDBs\n"
+                      << "  -k <value>    Specify the size of the k-mers\n"
+                      << "  -h, --help    Display this help message and exit\n";
+            return 0;
+        }
+    }
+
+    const fs::path uniprot_path = "./pdb_output/uniprot";
+    const fs::path pdbs_path = "./pdb_output/pdbs";
+
+    // directory_iterator throws on a missing directory; report it cleanly instead.
+    const fs::path& input_dir = process_all_pdbs ? pdbs_path : uniprot_path;
+    if(!fs::is_directory(input_dir)) {
+        std::cerr << "Input directory not found: " << input_dir.string() << '\n';
+        return 1;
+    }
+
+    std::vector<std::string> file_list;
+    if(process_all_pdbs) {
+        for(const auto& entry : fs::directory_iterator(pdbs_path)) {
+            // string() rather than c_str(): the native character type is not char everywhere
+            file_list.push_back(entry.path().string());
+        }
+    } else {
+        file_list = readUniprotFiles(uniprot_path);
+    }
+
+    const int total_files = static_cast<int>(file_list.size());
+
+    for(int i = 0; i < total_files; ++i) {
+        if(process_all_pdbs) {
+            parseKmersFile(file_list[i]);
+        } else {
+            const PdbInfo selected_pdb = parseInfoFile(file_list[i]);
+
+            const fs::path kmers_file_path = pdbs_path / (selected_pdb.pdb_id + ".kmers");
+            if(fs::exists(kmers_file_path)) {
+                parseKmersFile(kmers_file_path.string());
+            }
+        }
+
+        if((i + 1) % 100 == 0 || i + 1 == total_files) {
+            std::cerr << "\rProcessed " << (i + 1) << " / " << total_files << "; "
+                      << std::fixed << std::setprecision(2) << static_cast<double>(i + 1) / total_files * 100 << "%";
+            std::cerr.flush();
+        }
+    }
+
+    std::cerr << std::endl << "Prepairing results..." << std::endl;
+
+    std::vector<std::pair<std::string, int>> sorted_kmers(global_kmers.begin(), global_kmers.end());
+
+    std::sort(sorted_kmers.begin(), sorted_kmers.end(), [](const auto& a, const auto& b) {
+        if(a.second != b.second) {
+            return a.second > b.second;
+        }
+        return a.first < b.first;  // deterministic order among equal counts
+    });
+
+    for(const auto& [kmer, freq] : sorted_kmers) {
+        std::cout << kmer << " " << freq << '\n';  // '\n' avoids a flush per line
+    }
+
+    return 0;
+}
@@ -0,0 +1,48 @@
+from sklearn.neighbors import KDTree
+
+from kmers.pdb_data import PDBData
+
+import networkx as nx
+
+
+def calculate_kmers(pdb_data: PDBData, generate_graph: bool = False) -> list[str] or tuple[list[str], nx.Graph]:
+    """
+    Takes in a file of experimental data in the format <AA> <x> <y> <z>
+    and constructs proximity based k-mers for each, within 15 angstroms.
+
+    Uses a KDTree-based approach
+    """
+    residues = pdb_data.residue_list
+    coordinates = pdb_data.coordinates
+
+    indices, distances = _nearest_neighbours(residues, coordinates)
+
+    # if e.g. a graph is needed, this is the place to do it
+    # something like, maybe this works?
+    graph = nx.Graph()
+    if generate_graph:
+        for residue_idx, (ind, dist) in enumerate(zip(indices, distances)):
+            for neighbor_idx, distance in zip(ind, dist):
+                if residue_idx != neighbor_idx:  # avoid self-connections
+                    graph.add_edge(residue_idx, neighbor_idx, weight=distance)
+
+        # nx.write_graphml(graph, "residues.graphml")
+
+    closest_letters = [''.join([residues[i] for i in ind]) for ind in indices]
+
+    if generate_graph:
+        return closest_letters, graph
+
+    return closest_letters
+
+
+def _nearest_neighbours(_residues, coordinates, search_radius=15):
+    """
+    Returns 2 lists, each containing lists of indices of residues and distances.
+    For every residue, the list will be a sorted list of all residues within the search radius.
+    default=15 angstroms
+    """
+    tree = KDTree(coordinates)
+    indices, distances = tree.query_radius(coordinates, r=search_radius, return_distance=True, sort_results=True)
+
+    return indices, distances
@@ -0,0 +1,116 @@
+
+"""
+PDBData takes a byte stream (produced by extract_pdb_coordinates) and parses it into a PDBData object.
+
+The format of the byte stream, as read by PDBData._parse, is as follows:
+    1. First line is the PDB id; the value follows a colon and two spaces:
+        '{key}:  {pdb_id}' (str)
+    2. Second line is resolution:
+        'resolut: {resolution}' (float)
+    3. Third line is uniprot_id. Can be comma separated if multiple uniprot_ids are found:
+        'uniprot: {uniprot_id}' (str)
+    4. Fourth line is matched sequence. This is the sequence that is found in the PDB file (SEQRES)
+        'matched: {sequence}' (str)
+    5. Fifth line is parsed sequence. These are the residues that are found in the PDB file (ATOM)
+        'parsed:  {parsed_sequence}' (str)
+    6. Lines 6 to n are other sequences that are found in the PDB file (SEQRES); there may be none
+        'other:   {other_sequence}' (str)
+    7. Line n+1 is the number of the first residue (initres):
+        '{key}: {first_residue_number}' (int)
+    8. Line n+2 is a blank line
+    9. Lines n+3 to m are the coordinates of the PDB file (ATOM)
+        '{residue_name [e.g. A/E/M]} {x} {y} {z}' (char, float, float, float)
+"""
+
+
+class PDBData:
+    """Takes in a byte stream"""
+    def __init__(self, pdb_byte_stream):
+        self._pdb_id = None
+        self._resolution = None
+        self._parsed_sequence = None
+        self._first_residue_number = None
+        self._residue_list = []
+        self._coordinates = []
+
+        self._input_uniprot_ids = []
+        self._input_matched_sequence = None
+        self._input_other_sequences = []
+
+        self._parse(pdb_byte_stream)
+
+    def _parse(self, pdb_byte_stream):
+        lines = pdb_byte_stream.decode('utf-8').split("\n")
+
+        # Parse pdb id
+        self._pdb_id = lines[0].split(':  ')[1]
+
+        # Parse the resolution
+        self._resolution = float(lines[1].split(': ')[1])
+
+        # Parse the uniprot ids
+        self._input_uniprot_ids = lines[2].split(': ')[1].split(',')
+
+        # Parse the matched sequence
+        self._input_matched_sequence = lines[3].split(': ')[1]
+        # if self._input_matched_sequence == 'N/A':
+        #     print('N/A')
+
+        # Parse the parsed sequence
+        self._parsed_sequence = lines[4].split(':  ')[1]
+
+        # Parse the other sequences
+        idx = 5
+        while lines[idx].startswith('other:   '):
+            self._input_other_sequences.append(lines[idx].split(': ')[1])
+            idx += 1
+
+        # parse the first residue number (initres)
+        self._first_residue_number = int(lines[idx].split(': ')[1])
+
+        # Parse the coordinates
+        idx += 2  # skip the initres and blank line
+        while idx < len(lines) and lines[idx].strip():  # Ensure we are not at the end or at a blank line
+            parts = lines[idx].split()
+            residue_name = parts[0]
+            x, y, z = map(float, parts[1:])
+            self._residue_list.append(residue_name)
+            self._coordinates.append((x, y, z))
+            idx += 1
+
+    @property
+    def pdb_id(self):
+        return self._pdb_id
+
+    @property
+    def resolution(self):
+        return self._resolution
+
+    @property
+    def first_residue_number(self):
+        return self._first_residue_number
+
+    @property
+    def residue_sequence_parsed(self):
+        """
+        The parsed sequence is the sequence extracted from the PDB ATOM records.
+        :return:
+        """
+        return self._parsed_sequence
+
+    @property
+    def residue_sequence_matched(self):
+        """
+        The matched sequence is the sequence extracted from the PDB SEQRES records,
+        of which the parsed sequence is a substring of.
+        :return:
+        """
+        return self._input_matched_sequence
+
+    @property
+    def uniprot_ids(self):
+        return self._input_uniprot_ids
+
+    @property
+    def residue_list(self):
+        return self._residue_list
+
+    @property
+    def coordinates(self):
+        return self._coordinates
@@ -0,0 +1,160 @@
+import gzip
+import sqlite3
+import subprocess
+import time
+from pathlib import Path
+
+from kmers.calculate_kmer import calculate_kmers
+from kmers.pdb_data import PDBData
+
+
+class GZProcessor:
+    def __init__(self, db_path, process_dir, out_uniprot_dir, out_pdbs_dir, handle_all_pdbs):
+        self.db_path = db_path
+        self.process_dir = process_dir
+        self.out_uniprot_dir = out_uniprot_dir
+        self.out_pdbs_dir = out_pdbs_dir
+        self.handle_all_pdbs = handle_all_pdbs
+
+        if not self.handle_all_pdbs:
+            self.conn = sqlite3.connect(f'file:{self.db_path}?mode=ro', uri=True)
+
+        self.codes = {'SUCCESS': 0}
+        self.max_pdb_count = 1  # to avoid division by zero
+        self.cur_pdb_count = 0
+
+    def process_gz_file(self, gz_file):
+
+        # 1. extract coordinates
+        try:
+            parsed_pdb = self.extract_coordinates(gz_file)
+            self.codes['SUCCESS'] += 1
+        except Exception as e:
+            self.codes[str(e)] = self.codes.get(str(e), 0) + 1
+            return
+
+        # 2. parse data
+        pdb_data = PDBData(parsed_pdb)
+
+        # 3. find matching uniprot entry, reject if not found
+        if not self.handle_all_pdbs:
+            uniprot_id = self.get_matching_uniprot_entry(pdb_data)
+            if uniprot_id is None:
+                self.codes['SUCCESS'] -= 1
+                self.codes['NO_UNIPROT_ID'] = self.codes.get('NO_UNIPROT_ID', 0) + 1
+                return
+
+        # print(f'{pdb_id} -> {uniprot_id}')
+
+        kmers = calculate_kmers(pdb_data)
+        # 4. write data to pdb & uniprot files
+        self._write_pdb_file(pdb_data.pdb_id, kmers)
+        if not self.handle_all_pdbs:
+            self._append_to_uniprot_file(uniprot_id, pdb_data.pdb_id, pdb_data)  # noqa
+
+        # 5. pass kmers to natural set parser
+        # TODO
+
+    @staticmethod
+    def extract_coordinates(gz_file):
+        proc = subprocess.Popen(['bin/extract_pdb_coordinates'],
+                                stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        with gzip.open(gz_file, 'r') as f_in:
+            pdb_str = f_in.read()
+
+        parsed_pdb, err = proc.communicate(input=pdb_str)
+
+        if proc.returncode != 0:
+            raise Exception(err.decode('utf-8').splitlines()[0])
+
+        return parsed_pdb
+
+    def process_files(self):
+        """
+        Process all files in the process_dir
+        :return:
+        """
+        self.max_pdb_count = count_files(self.process_dir, '*.ent.gz')
+        print(f'Processing {self.max_pdb_count} PDB files files...')
+
+        time_start = time.time()
+
+        for gz_file in Path(self.process_dir).rglob('*.ent.gz'):
+            self.process_gz_file(gz_file)
+            self.cur_pdb_count += 1
+
+            if self.cur_pdb_count % 100 == 0:
+                self.print_progress()
+
+        self.print_progress()
+        time_end = time.time()
+
+        self.print_codes()
+        print(f'\nCompleted in {time_end - time_start:.2f} seconds')
+
+    def print_progress(self):
+        print(f'\r{self.cur_pdb_count:<{len(str(self.max_pdb_count))}} / {self.max_pdb_count}, '
+              f'{self.cur_pdb_count / self.max_pdb_count:.1%}', end='')
+
+    def print_codes(self):
+        print()
+        for k, v in self.codes.items():
+            print(f'{k}: {v}')
+
+    def get_matching_uniprot_entry(self, pdb_data):
+        """
+        Fetch sequences by uniprot ids first and then check for sequence match.
+        """
+        sequence = pdb_data.residue_sequence_parsed
+        pdb_uniprot_ids = pdb_data.uniprot_ids
+        first_residue_number = pdb_data.first_residue_number
+
+        cur = self.conn.cursor()
+
+        # Prepare IDs for the query
+        placeholders = ', '.join(['?'] * len(pdb_uniprot_ids))
+        query = f'SELECT id, sequence FROM sequences WHERE id IN ({placeholders})'
+
+        cur.execute(query, tuple(pdb_uniprot_ids))
+        ids_and_sequences = cur.fetchall()
+
+        if len(ids_and_sequences) == 0:
+            return None
+
+        matched_uniprot_ids = []
+        # Check for exact match if first_residue_number is not 0
+        if first_residue_number != 0:
+            # Find sequences that exactly match the input sequence starting and ending at the given points
+            matched_uniprot_ids = [entry[0] for entry in ids_and_sequences if sequence ==
+                                   entry[1][first_residue_number - 1:first_residue_number + len(sequence) - 1]]
+
+        # If no exact matches or first_residue_number is 0, find sequences that have the input sequence as a substring
+        if not matched_uniprot_ids:
+            matched_uniprot_ids = [entry[0] for entry in ids_and_sequences if sequence in entry[1]]
+
+        all_matches = [x for x in pdb_uniprot_ids if x in matched_uniprot_ids]
+
+        # if len(all_matches) > 1:
+        #     print(f'Found multiple matches for {sequence}: {all_matches}')
+        return all_matches[0] if len(all_matches) > 0 else None
+
+    def _write_pdb_file(self, pdb_id: str, kmers: list[str]):
+        with open(f'{self.out_pdbs_dir}/{pdb_id}.kmers', 'w') as f_out:
+            for kmer in kmers:
+                f_out.write(f'{kmer}\n')
+
+    def _append_to_uniprot_file(self, uniprot_id: str, pdb_id: str, pdb_data: PDBData):
+        if not Path(f'{self.out_uniprot_dir}/{uniprot_id}.info').exists():
+            with open(f'{self.out_uniprot_dir}/{uniprot_id}.info', 'w') as f_out:
+                f_out.write(f'>{uniprot_id}\n')
+
+        with open(f'{self.out_uniprot_dir}/{uniprot_id}.info', 'a') as f_out:
+            f_out.write(f'{pdb_id} {pdb_data.resolution} {len(pdb_data.residue_sequence_parsed)} '
+                        f'{pdb_data.residue_sequence_parsed}\n')
+
+
+def count_files(directory='.', extension='*'):
+    count = 0
+    for _ in Path(directory).rglob(extension):
+        count += 1
+    return count
@@ -0,0 +1,126 @@
+import argparse
+import os
+from pathlib import Path
+
+from kmers.pdb_gz_processor import GZProcessor
+
+"""
+- 1. OS walk through all PDB gz files 
+- 2. Extract the gz files
+- 3. extract the PDB coordinates annotated with CA (with C++ script) (PDB_ID.coor)
+-     3.1. If unsuccessful (return 1), delete extracted file & created file, next file
+-     3.2 If successful, delete extracted file & continue to 4.
+- 4. Calculate the Kmers
+5. Write them to file (PDB_ID.kmers)
+6. Repeat for all files
+
+After batch processing all files:
+# 7. Associate each PDB with a protein from the DB
+# 8. Select the PDB with the highest resolution
+...
+
+9. Read all Kmers into memory (set), with frequency information
+    9.1. Truncate to 12 residues
+    9.2. Discard ones which are shorter
+10. Save natural set where n=12 to disk, along with frequencies
+11. Generate synthetic set. (all possible Kmers of n=12)
+12. Calculate difference
+13. Extract statistics on freq. dist of natural set, fraction appearing in synthetic set, etc.
+"""
+
+
+# Names of the expected rejection reasons. Not referenced anywhere else in the
+# visible part of this script.
+# NOTE(review): "NO_UNIPROT_ID" is the key GZProcessor records itself; the others are
+# presumably the first stderr line printed by extract_pdb_coordinates — confirm.
+standard_error_codes = {
+    "RESOLUTION_TOO_LOW",
+    "MISSING_NON_TERMINAL_RESIDUES",
+    "IS_NOT_PROTEIN",
+    "NO_ALPHA_CARBON_ATOMS_FOUND",
+    "EXCLUDE_SELENOCYSTEINE_AND_PYRROLYSINE",
+    "NO_UNIPROT_ID"
+}
+
+
+def check_empty_directory(directory):
+    if not os.path.exists(directory):
+        return
+
+    if os.listdir(directory):
+        raise RuntimeError(f"Error: Output directory '{directory}' is not empty. Aborting.")
+
+
+def check_file_exists(file_path):
+    if not os.path.exists(file_path):
+        raise RuntimeError(f"Error: File '{file_path}' does not exist. Aborting.")
+
+    if not os.path.isfile(file_path):
+        raise RuntimeError(f"Error: Path '{file_path}' is not a file. Aborting.")
+
+
+def check_directory_exists(directory_path):
+    if not os.path.exists(directory_path):
+        raise RuntimeError(f"Error: Directory '{directory_path}' does not exist. Aborting.")
+
+    if not os.path.isdir(directory_path):
+        raise RuntimeError(f"Error: Path '{directory_path}' is not a directory. Aborting.")
+
+
+def check_directory_contains_files(directory_path, extension=""):
+    for f in Path(directory_path).rglob(f"*{extension}"):
+        if f.is_file():
+            return True
+    if extension == "":
+        raise RuntimeError(f"Error: Directory '{directory_path}' does not contain any files. Aborting.")
+    else:
+        raise RuntimeError(f"Error: Directory '{directory_path}' does not contain any files with extension "
+                           f"'{extension}'. Aborting.")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Process PDB files.')
+    parser.add_argument('--handle_all_pdbs', required=True, type=bool,
+                        help='Set to True to handle all PDBs without checking for uniprot IDs')
+    args = parser.parse_args()
+
+    if args.handle_all_pdbs not in [True, False]:
+        raise ValueError("The --handle_all_pdbs argument must be set to either True or False.")
+
+    db_path = os.path.expanduser('uniprotkb/uniprot_sequences.db')
+
+    if not args.handle_all_pdbs:  # Only check for database if we need it
+        try:
+            check_file_exists(db_path)
+        except RuntimeError:
+            print("Error: Database of uniprot sequences not found. Run ./prepare.sh after downloading"
+                  " uniprot_sprot.fasta.gz and (optionally) uniprot_trembl.fasta.gz into the uniprotkb dir."
+                  "Disk space for sprot only is ~400MB, or ~250GB for both.")
+            exit(1)
+
+    process_dir = 'pdb'
+    try:
+        check_directory_exists(process_dir)
+        check_directory_contains_files(process_dir, extension='.ent.gz')
+    except RuntimeError:
+        print("Error: Directory 'pdb' not found or does not contain any .ent.gz files. "
+              "It should contain e.g. pdb/a0/1a00.ent.gz, pdb/a0/1a01.ent.gz, etc."
+              "Download PDB files from ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/ "
+              "and extract them into the pdb directory."
+              "A mirror can be found at https://pycom.brunel.ac.uk/misc/ (42GB tar file * 2 = 84GB)")
+        exit(1)
+
+    output_dir = 'pdb_output'
+    try:
+        check_empty_directory(output_dir)
+    except RuntimeError:
+        print("Error: Output directory 'pdb_output' is not empty. Aborting."
+              "If you want to re-run the script, delete the directory first.")
+        exit(1)
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        os.makedirs(os.path.join(output_dir, 'uniprot'))
+        os.makedirs(os.path.join(output_dir, 'pdbs'))
+
+    out_uniprot = os.path.join(output_dir, 'uniprot')
+    out_pdbs = os.path.join(output_dir, 'pdbs')
+
+    processor = GZProcessor(db_path, process_dir, out_uniprot, out_pdbs, args.handle_all_pdbs)
+    processor.process_files()
@@ -0,0 +1,23 @@
+# PDB Folder
+
+This folder should contain compressed PDB files in the following format:
+
+(Dividing the files into subdirectories is not required; they can all be in the same directory.)
+```
+pdb/a0/1a00.ent.gz
+pdb/a0/1a01.ent.gz
+...
+```
+
+These can be downloaded from the official hosts / mirrors for PDB:
+
+Official website describing how to download:
+https://www.rcsb.org/docs/programmatic-access/file-download-services
+
+Main mirror:
+https://files.wwpdb.org/pub/pdb/data/structures/divided/pdb/
+
+Other mirrors:
+https://www.wwpdb.org/ftp/pdb-ftp-sites
+
+Alternatively, a tarball of all PDB files (as of 2023-07-28) is hosted at https://pycom.brunel.ac.uk/misc/pdb_2023-07-28.tar (42GB; it needs to be extracted, so about 84GB of temporary storage is needed).
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+mdir=$(dirname $(realpath "$0"))
+cd "$mdir"
+
+# List of uniprot files to load into the database.
+# Paths are relative to the script directory (they are joined with $mdir below).
+# SwissProt only: 92 MB .gz file to 250 MB database
+files=("uniprotkb/uniprot_sprot.fasta.gz")
+
+# SwissProt+TrEMBL: 62 GB .gz files to ~191 GB database
+# (bash array elements are separated by whitespace, not commas)
+# files=("uniprotkb/uniprot_sprot.fasta.gz" "uniprotkb/uniprot_trembl.fasta.gz")
+
+
+# Check if sqlite3 is installed
+command -v sqlite3 >/dev/null 2>&1 || { echo >&2 "sqlite3 required but it's not installed (brew install sqlite3 [on macos]). Aborting."; exit 1; }
+
+# Check if g++ is installed
+command -v g++ >/dev/null 2>&1 || { echo >&2 "g++ required but it's not installed. Aborting."; exit 1; }
+
+# Check if gunzip is installed
+command -v gunzip >/dev/null 2>&1 || { echo >&2 "gunzip required but it's not installed. Aborting."; exit 1; }
+
+
+# Compile C++ program
+echo "Compiling C++ programs..."
+arch -x86_64 g++ -std=c++17 -o "$mdir"/bin/fasta_to_sqlite "$mdir"/cpp_scripts/fasta_to_sqlite/*.cpp -lsqlite3
+if [ $? -ne 0 ]; then
+    echo "Compilation failed. Please check the source code."
+    exit 1
+fi
+
+g++ -std=c++17 -o "$mdir"/bin/post_process_kmers "$mdir"/cpp_scripts/post_process_kmers/*.cpp
+if [ $? -ne 0 ]; then
+    echo "Compilation failed. Please check the source code."
+    exit 1
+fi
+
+g++ -std=c++17 -o "$mdir"/bin/extract_pdb_coordinates "$mdir"/cpp_scripts/extract_pdb_coordinates/*.cpp
+if [ $? -ne 0 ]; then
+    echo "Compilation failed. Please check the source code."
+    exit 1
+fi
+
+echo "Compiled all C++ programs."
+
+# If a database from a previous run is present, ask before replacing it.
+# Only the exact answers "yes" and "no" are accepted; anything else is an error.
+dbfile="$mdir/uniprotkb/uniprot_sequences.db"
+if [[ -f "$dbfile" ]]; then
+    read -p "The file $dbfile already exists. Do you want to overwrite it? (yes/no): " response
+    case "$response" in
+        no)
+            echo "Aborting."
+            exit 0
+            ;;
+        yes)
+            rm "$dbfile"
+            ;;
+        *)
+            echo "Invalid response. Please type 'yes' or 'no'."
+            exit 1
+            ;;
+    esac
+fi
+
+# Verify every configured input archive is present before doing any work.
+for filepath in "${files[@]}"; do
+    # Present: nothing to report, move on to the next entry.
+    [[ -f "$mdir/$filepath" ]] && continue
+
+    echo "The required file $filepath is missing."
+    echo "Please make sure that uniprot_trembl.fasta.gz and uniprot_sprot.fasta.gz are in the 'uniprotkb' directory."
+    echo "The file can be downloaded from https://www.uniprot.org/downloads"
+    exit 1
+done
+
+# Create database
+# Each archive is decompressed to stdout and streamed into fasta_to_sqlite.
+for filepath in "${files[@]}"; do
+    echo "Processing $filepath..."
+    gunzip -c "$mdir/$filepath" | ./bin/fasta_to_sqlite
+    # Capture both exit codes immediately; PIPESTATUS is overwritten by the next command.
+    pipe_status=("${PIPESTATUS[@]}")
+    if (( pipe_status[0] != 0 || pipe_status[1] != 0 )); then
+        # Do not claim success (or go on to index a partial database) when either stage failed.
+        echo >&2 "Failed to process $filepath (gunzip exit ${pipe_status[0]}, fasta_to_sqlite exit ${pipe_status[1]}). Aborting."
+        exit 1
+    fi
+    echo "Done processing $filepath"
+done
+
+# Create indexes on the database
+echo "Creating indexes..."
+# Only report success when sqlite3 actually succeeded (e.g. the table exists).
+if ! sqlite3 "$dbfile" "CREATE INDEX IF NOT EXISTS idx_id ON sequences (id);"; then
+    echo >&2 "Failed to create index idx_id on $dbfile. Aborting."
+    exit 1
+fi
+
+# this index is too large (doubles the DB size)
+# sqlite3 "$dbfile" "CREATE INDEX IF NOT EXISTS idx_sequence ON sequences (sequence);"
+echo "Indexes created."
+
+echo "Extracting Residue Coordinates and Generating k-mers..."
+PYTHONPATH="${PYTHONPATH}:$mdir" python3 kmers/pipeline.py --handle_all_pdbs true
+echo "Done generating k-mers"
+
+k=12
+
+echo "Extracting most frequent k-mers of length k=$k"
+./bin/post_process_kmers -a -k "$k > kmers.txt"
+echo "Finished. Results in \`kmers.txt\`"
+echo "Top k-mers"
+cat kmers.txt | head
@@ -0,0 +1,7 @@
+This folder should contain two files:
+
+uniprot_sprot.fasta.gz
+uniprot_trembl.fasta.gz
+
+The files can be downloaded from https://www.uniprot.org/downloads
+They total around 55 GB.
-Original file line number
+Diff line change
 +A
 +B
 +C
 +D
 +E