cemiu
diff --git a/‎.gitignore
+6-3 b/‎.gitignore
+6-3
diff --git a/‎README.md
+24-16 b/‎README.md
+24-16
diff --git a/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.h
+17-3 b/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.h
+17-3
diff --git a/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.cpp ‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.sdafdsafcpp.cpp b/‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.cpp ‎cpp_scripts/extract_pdb_coordinates/AtomDataParser.sdafdsafcpp.cpp
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Constants.cpp
+7-3 b/‎cpp_scripts/extract_pdb_coordinates/Constants.cpp
+7-3
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Constants.h
+3-3 b/‎cpp_scripts/extract_pdb_coordinates/Constants.h
+3-3
diff --git a/‎cpp_scripts/extract_pdb_coordinates/PDBContext.h
+50 b/‎cpp_scripts/extract_pdb_coordinates/PDBContext.h
+50
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Utils.cpp
+12-14 b/‎cpp_scripts/extract_pdb_coordinates/Utils.cpp
+12-14
diff --git a/‎cpp_scripts/extract_pdb_coordinates/Utils.h
+6-6 b/‎cpp_scripts/extract_pdb_coordinates/Utils.h
+6-6
diff --git a/‎cpp_scripts/extract_pdb_coordinates/extract_pdb_coordinates
361 KB b/‎cpp_scripts/extract_pdb_coordinates/extract_pdb_coordinates
361 KB
@@ -93,10 +93,13 @@ ENV/
 
 .idea/
 .DS_Store
-*/inputf.txt
-*/outputf.txt
+inputf.txt
+outputf.txt
 bin/
 kmers.txt
-pdb/**.ent.gz
+pdb/*.ent.gz
+pdb/*/*.ent.gz
+uniprotkb/**.fasta.gz
+
 
 .vscode/*
@@ -6,22 +6,23 @@ Description and purpose of the project will come at a later point.
 ### Prerequisites
 `g++`: through gcc installation
 
-`sqlite3`: Debian/Ubuntu: `apt install sqlite3`; macOS: `brew install sqlite3`
-
 `gzip / gunzip`: usually pre-installed
 
 `python3`: installed and part of path
 
-### Preperation
+optional: `sqlite3`: Debian/Ubuntu: `apt install sqlite3`; macOS: `brew install sqlite3`
+
+### Preparation
+
 Clone the repo
 ```
 git clone https://github.com/cemiu/kmers.git && cd kmers
 ```
 Two folders need to be populated:
-- pdb
-- uniprotkb
+- `pdb` (required)
+- `uniprotkb` (optional)
 ### pdb
-The pdb folder has to contain all experimental PDB file in the .ent.gz format.
+The `pdb` folder has to contain experimental PDB files in the .ent.gz format.
 
 Instructions for downloading can be found here:
 
@@ -31,18 +32,24 @@ https://files.wwpdb.org/pub/pdb/data/structures/divided/pdb/
 
 Alternatively a mirror can be found here: https://pycom.brunel.ac.uk/misc/pdb_2023-07-28.tar (42 GB)
 
-Once downloaded they have to be placed in the `pdb` folder **without** being uncompressed. It does not matter whether they are in `pdb/file.ent.gz` or `pdb/<folder>/file.ent.gz`.
+Once downloaded they have to be placed in the `pdb` folder **without** being decompressed.
+It does not matter whether they are divided; e.g. `pdb/file.ent.gz` or `pdb/<folder>/file.ent.gz`.
 
 ### uniprotkb
-The project requires `uniprot_sprot.fasta.gz` (400 MB after processing)
+**Optionally**, the `uniprotkb` folder can be populated with the uniprotkb fasta files.
+This is only needed if the PDBs should be associated with a protein. If this is not required, **skip this step**.
 
-Optionally, `uniprot_trembl.fasta.gz` can be used, to match more PDBs (250 GB after processing).
+Files:
+- `uniprot_sprot.fasta.gz` (400 MB after processing)
+  - Swiss-Prot alone covers the majority of PDBs
+- Optionally, `uniprot_trembl.fasta.gz` (250 GB after processing)
+  - Adding TrEMBL matches more PDBs; useful when maximum coverage is needed
 
 The latter might result in (slightly) more PDBs which can be associated to a Protein. The difference is expected to be trivial.
 
-Place the files in the `uniprotkb` folder without uncompressing them.
+Place the files in the `uniprotkb` folder without decompressing them.
 
-By default, only Swiss-Prot is used. To also use TrEMBL, uncomment line 11 in `run.sh`.
+Once `run.sh` has been executed and the database `uniprotkb/uniprot_sequences.db` has been created, the files can be deleted.
 
 ### Running
 
@@ -52,8 +59,9 @@ To run the script, execute:
 ```
 
 This will
-- Compile C++ binaries
-- Process Swiss-Prot / TrEMBL into a database 
-  - (`uniprot_sprot.fasta.gz` / `uniprot_trembl.fasta.gz`) can be deleted afterwards
-- Extract 3d k-mer from the PDBs
-- TODO: process the k-mers
+- Compile C++ binaries, if they don't exist
+- Ask whether to process the uniprotkb files
+  - If yes, create the database `uniprotkb/uniprot_sequences.db`, if it doesn't exist
+- Ask for k-mer size (default: k=12)
+- Extract 3D k-mers from the PDBs (into the `pdb_output` folder)
+- Extract k-mers of length k into `kmer.txt`, along with their frequencies
@@ -2,15 +2,29 @@
 #define ATOMDATAPARSER_H
 
 #include <string>
+#include <sstream>
 
 struct AtomData
 {
-    bool isValidAtom; // whether the atom is valid (CA) 
+    bool isValidAtom; // whether the atom is valid (CA)
     std::string resName; // residue name (AA)
     int resSeq; // residue sequence number
     float x, y, z;
-};
 
-void parseAtomData(const std::string& str, AtomData& data, size_t offset = 0);
+    AtomData(const std::string& str, size_t offset = 0) // parses one fixed-column atom record; every column index below is shifted left by offset
+    {
+        std::string atom_name = str.substr(-offset + 12, 4); // unsigned arithmetic: -offset + 12 == 12 - offset, so offset must not exceed 12
+        std::stringstream atom_ss(atom_name);
+        atom_ss >> atom_name; // stream extraction drops the padding spaces around the name
+    
+        isValidAtom = atom_name == "CA"; // flag only; the remaining fields are still parsed for every atom
+
+        resName = str.substr(-offset + 17, 3);
+        resSeq = std::stoi(str.substr(-offset + 22, 4)); // substr throws std::out_of_range when the start index lies past the end of str; stoi/stof throw std::invalid_argument on a blank field
+        x = std::stof(str.substr(-offset + 30, 8));
+        y = std::stof(str.substr(-offset + 38, 8));
+        z = std::stof(str.substr(-offset + 46, 8));
+    }
+};
 
 #endif // ATOMDATAPARSER_H
@@ -1,5 +1,7 @@
 #include "Constants.h"
 
+#include <unordered_set>
+
 #define X(code, name) name,
 const char *code_name[] = {
     PDB_PARSING_CODES
@@ -16,7 +18,9 @@ const std::unordered_map<std::string, char> aminoAcidLookup = {
     {"TRP", 'W'}, {"VAL", 'V'}, {"SEC", 'U'}, {"PYL", 'O'},
     {"XPL", 'O'}, // for pdb 1L2Q
     {"GLX", 'Z'}, // for pdb 1KP0 
-    {"ASX", 'B'} // for pdb 1KP0
-    // 3e2o, 2fmd, 2atc, 4cpa
-
+    {"ASX", 'B'}, // for pdb 1KP0
+    {"UNK", '.'} // unknown AA
 };
+
+// rare or unknown amino acids, by one-letter code: U = SEC (selenocysteine), O = PYL (pyrrolysine) and XPL, Z = GLX, B = ASX, '.' = UNK (unknown AA)
+const std::unordered_set<char> invalidAminoAcids{'U', 'O', 'Z', 'B', '.'};
@@ -2,6 +2,7 @@
 #define CONSTANTS_H
 
 #include <unordered_map>
+#include <unordered_set>
 #include <string>
 
 #define PDB_PARSING_CODES \
@@ -11,13 +12,11 @@ X(RESOLUTION_NOT_SPECIFIED, "RESOLUTION_NOT_SPECIFIED") \
 X(MISSING_NON_TERMINAL_RESIDUES, "MISSING_NON_TERMINAL_RESIDUES") \
 X(NO_ALPHA_CARBON_ATOMS_FOUND, "NO_ALPHA_CARBON_ATOMS_FOUND") \
 X(IS_NOT_PROTEIN, "IS_NOT_PROTEIN") \
-X(EXCLUDE_RARE_AMINO_ACIDS, "EXCLUDE_RARE_AMINO_ACIDS") \
+X(EXCLUDE_UNKNOWN_OR_RARE_AMINO_ACIDS, "EXCLUDE_UNKNOWN_OR_RARE_AMINO_ACIDS") \
 X(HAS_UNKNOWN_RESIDUE, "HAS_UNKNOWN_RESIDUE") \
 X(INVALID_SEQUENCE, "INVALID_SEQUENCE") \
 X(NO_UNIPROT_ID, "NO_UNIPROT_ID") \
 
-// rare amino acids = SELENOCYSTEINE, PYRROLYSINE, others
-
 #define X(code, name) code,
 enum PDBParsingCode : size_t {
     PDB_PARSING_CODES
@@ -37,6 +36,7 @@ enum PDBType {PROTEIN, DNA, RNA, MISC};
 
 extern const float MAX_RESOLUTION;
 extern const std::unordered_map<std::string, char> aminoAcidLookup;
+extern const std::unordered_set<char> invalidAminoAcids;
 
 #endif // CONSTANTS_H
 
@@ -0,0 +1,50 @@
+#ifndef PDBCONTEXT_H
+#define PDBCONTEXT_H
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+
+struct PDBContext { // parsing context passed by reference to the process* functions in Utils
+    // main data
+    std::string pdbId; // set by processHeader from the header line
+    float resolution = -1.0f; // overwritten by processRemark; -1 presumably means "not specified" — confirm
+    std::unordered_set<std::string> uniprotIds; // filled by processDBRef / processDBRef1
+    
+    // input and output data
+    std::vector<std::string> output; // cleared by resetPDBOutput(); its producer is not visible in this header
+
+    // sequence data
+    std::stringstream parsedSequence; // emptied by resetPDBOutput()
+    std::unordered_map<char, std::stringstream> sequenceStreams; // chain id -> one-letter codes appended by processSequence
+
+    // residue tracking
+    int prevCAResiduePosition = -1;
+    int firstCAResidue = 0;
+
+    // condition flags
+    bool hasResiduesOutOfOrder = true; // NOTE(review): starts as (and is reset to) true — confirm the intended polarity
+    bool anyCAAtomsPresent = false; // latched to true by resetPDBOutput() once output was non-empty
+    bool isNotProtein = false;
+    bool hasExcludedAminoAcid = false;
+
+    // error tracking
+    std::vector<std::string> errorOutput;
+
+    void resetPDBOutput() { // resets the per-output state; pdbId, resolution, uniprotIds, sequenceStreams, isNotProtein and errorOutput are kept
+        if (!anyCAAtomsPresent && output.size()) { // remember that some output existed before it is discarded
+            anyCAAtomsPresent = true;
+        }
+
+        output.clear();
+        hasResiduesOutOfOrder = true;
+        hasExcludedAminoAcid = false;
+        prevCAResiduePosition = -1;
+        firstCAResidue = 0;
+        parsedSequence.str(""); // replaces the buffer only; the stream's state flags are left as they are
+    }
+};
+
+#endif // PDBCONTEXT_H
@@ -5,6 +5,7 @@
 #include <iostream>
 
 #include "Constants.h"
+#include "PDBContext.h"
 
 std::string concatenateString(const std::vector<std::string>& strings) {
     const char delim = ',';
@@ -56,11 +57,11 @@ float extractResolution(const std::string &line) {
 /// PROCESS ENTRY ///
 /////////////////////
 
-PDBType processHeader(const std::string &line) {
+PDBType processHeader(const std::string &line, PDBContext &con) {
     std::string cls = line.substr(10, 40); // 11-50
     std::string pdbId = line.substr(62, 4); // 63-66
 
-    std::cout << "pdb_id:  " << pdbId << std::endl;
+    con.pdbId = pdbId;
 
     if (cls.find("DNA") != std::string::npos) {
         if (cls.find("DNA BINDING PROTEIN") == std::string::npos)
@@ -75,31 +76,31 @@ PDBType processHeader(const std::string &line) {
 }
 
 // Remark row
-void processRemark(const std::string &line, float &resolution) {
+void processRemark(const std::string &line, PDBContext &con) { // stores the REMARK 2 resolution in con.resolution; other remark numbers are ignored
     int remark_no = std::stoi(line.substr(7, 3));
     switch (remark_no) {
         case 2:
-            int extractedRes = extractResolution(line);
+            const float extractedRes = extractResolution(line); // keep the float: storing it in an int truncated the value before the -1 check
             if (extractedRes != -1) {
-                resolution = extractResolution(line);
+                con.resolution = extractedRes; // reuse the parsed value instead of parsing the line a second time
             }
             break;
     }
 }
 
 // DBRef row
-void processDBRef(const std::string &line, std::unordered_set<std::string> &uniprotIds) {
+void processDBRef(const std::string &line, PDBContext &con) { // adds the UniProt id of a DBREF row to con.uniprotIds; rows for other databases are ignored
     std::string db = line.substr(26, 6); // 27 - 32
     if (db != "UNP   ") // only match uniprot
         return;
 
     std::string uniprotId = line.substr(33, 8); // 34 - 41
     std::stringstream parser(uniprotId);
     parser >> uniprotId;
-    uniprotIds.insert(uniprotId);
+    con.uniprotIds.insert(uniprotId); // id was whitespace-trimmed by the extraction above; the set stores a repeated id only once
 }
 
-void processDBRef1(const std::string &line, std::unordered_set<std::string> &uniprotIds) {
+void processDBRef1(const std::string &line, PDBContext &con) {
     // process 1 for uniprot
     std::string db = line.substr(26, 6); // 27 - 32
     if (db != "UNP   ") // only match uniprot
@@ -112,24 +113,21 @@ void processDBRef1(const std::string &line, std::unordered_set<std::string> &uni
     std::string uniprotId = nextLine.substr(18, 22); // 19 - 40
     std::stringstream parser(uniprotId);
     parser >> uniprotId;
-    uniprotIds.insert(uniprotId);
+    con.uniprotIds.insert(uniprotId);
 }
 
 // SEQRES row
-void processSequence(const std::string &line, std::unordered_map<char, std::stringstream> &sequenceStreams) {
+void processSequence(const std::string &line, PDBContext &con) {
     char chainId = line[11];
     std::string aa;
     std::string aaLine = line.substr(19, 51);
     std::stringstream aaStream(aaLine);
     while (aaStream >> aa) {
         try {
-            sequenceStreams[chainId] << aminoAcidLookup.at(aa);
+            con.sequenceStreams[chainId] << aminoAcidLookup.at(aa);
         } catch (std::out_of_range) {
             // replace non-standard AA with dot (.)
-            sequenceStreams[chainId] << '.';
-
-            // sequenceStreams.erase(chainId);
-            // throw std::out_of_range("test");
+            con.sequenceStreams[chainId] << '.';
             return;
         }
     }
 
@@ -6,15 +6,15 @@
 #include <unordered_set>
 
 #include "Constants.h"
+#include "PDBContext.h"
 
 std::string concatenateString(const std::vector<std::string>& strings);
 std::string concatenateString(const std::unordered_set<std::string>& strings);
 
-PDBType processHeader(const std::string &line);
-float extractResolution(const std::string &line);
-void processRemark(const std::string &line, float &resolution);
-void processDBRef(const std::string &line, std::unordered_set<std::string> &uniprotIds);
-void processDBRef1(const std::string &line, std::unordered_set<std::string> &uniprotIds);
-void processSequence(const std::string &line, std::unordered_map<char, std::stringstream> &sequenceStreams);
+PDBType processHeader(const std::string &line, PDBContext &con); // stores the PDB id in con.pdbId and returns the entry's PDBType
+void processRemark(const std::string &line, PDBContext &con); // updates con.resolution from REMARK 2 rows
+void processDBRef(const std::string &line, PDBContext &con); // adds the UniProt id of a UNP DBREF row to con.uniprotIds
+void processDBRef1(const std::string &line, PDBContext &con); // adds a UniProt id to con.uniprotIds (UNP rows only)
+void processSequence(const std::string &line, PDBContext &con); // appends one-letter codes of a SEQRES row to con.sequenceStreams, keyed by chain id
 
 #endif // UTILS_H