|
| 1 | +#include <iostream> |
| 2 | +#include <unordered_map> |
| 3 | +#include <string> |
| 4 | +#include <vector> |
| 5 | +#include <unordered_set> |
| 6 | +#include <sstream> |
| 7 | + |
| 8 | +#include "AtomDataParser.h" |
| 9 | +#include "Constants.h" |
| 10 | +#include "Utils.h" |
| 11 | + |
| 12 | + |
// ---- Parser state shared by the functions in this file ----

// bool invalidSequence = false;

// Raised by processAtom when a residue maps to one of the codes U, O, Z or B.
bool invalidAA{false};

// Raised by processAtom when an "UNK" residue name is encountered.
bool hasUnknownResidues{false};

// One-letter residue codes appended by processAtom, in acceptance order.
std::stringstream parsedSequence;

// "missing residues" messages recorded by validateAtomSequence.
std::vector<std::string> errorOutput;

// Residue number stored by validateAtomSequence when it replaces the -1
// sentinel; reset to 0 by resetPDBOutput.
int firstCAResidue{0};
| 20 | + |
| 21 | +ResidueConfirmation validateAtomSequence(int &prevCAResiduePosition, const int &resSeq) { |
| 22 | + if (prevCAResiduePosition + 1 != resSeq) { |
| 23 | + if (prevCAResiduePosition == -1) { // initial value |
| 24 | + prevCAResiduePosition = resSeq; |
| 25 | + firstCAResidue = resSeq; |
| 26 | + return RESIDUE_VALID; |
| 27 | + } |
| 28 | + |
| 29 | + if (prevCAResiduePosition == resSeq) |
| 30 | + return RESIDUE_DUPLICATE; |
| 31 | + |
| 32 | + std::stringstream errorString; |
| 33 | + errorString << "missing residues; prev=" << prevCAResiduePosition << ", next=" << resSeq; |
| 34 | + errorOutput.push_back(errorString.str()); |
| 35 | + |
| 36 | + prevCAResiduePosition = resSeq; |
| 37 | + return RESIDUE_OUT_OF_SEQUENCE; |
| 38 | + } |
| 39 | + prevCAResiduePosition = resSeq; |
| 40 | + return RESIDUE_VALID; |
| 41 | +} |
| 42 | + |
| 43 | +////////////////////////// |
| 44 | +////////////////////////// |
| 45 | +///// PROCESS ROWS /////// |
| 46 | +////////////////////////// |
| 47 | +////////////////////////// |
| 48 | + |
| 49 | +void processAtom(std::string &line, std::vector<std::string> &output, int &prevCAResiduePosition, bool &isSequenceValid) { |
| 50 | + AtomData data; |
| 51 | + parseAtomData(line, data, 0); |
| 52 | + if (!data.isValidAtom) { |
| 53 | + return; |
| 54 | + } |
| 55 | + |
| 56 | + switch(validateAtomSequence(prevCAResiduePosition, data.resSeq)) { |
| 57 | + case RESIDUE_VALID: |
| 58 | + break; // continue |
| 59 | + case RESIDUE_DUPLICATE: |
| 60 | + return; // skip to next |
| 61 | + case RESIDUE_OUT_OF_SEQUENCE: |
| 62 | + isSequenceValid = false; |
| 63 | + break; |
| 64 | + } |
| 65 | + |
| 66 | + char aminoAcid; |
| 67 | + try { |
| 68 | + aminoAcid = aminoAcidLookup.at(data.resName); |
| 69 | + } catch (std::out_of_range) { // should never throw if pdb is valid & not unknown |
| 70 | + if (data.resName == "UNK") { |
| 71 | + hasUnknownResidues = true; |
| 72 | + return; |
| 73 | + } |
| 74 | + |
| 75 | + throw std::runtime_error("Unexpected atom type: " + data.resName); |
| 76 | + } |
| 77 | + |
| 78 | + // Selenocysteine, Pyrrolysine, GLX, ASX, too rare, skip |
| 79 | + if (aminoAcid == 'U' || aminoAcid == 'O' || aminoAcid == 'Z' || aminoAcid == 'B') { |
| 80 | + invalidAA = true; |
| 81 | + } |
| 82 | + |
| 83 | + // construct output string |
| 84 | + std::stringstream ss; |
| 85 | + ss << aminoAcid << ' ' << data.x << ' ' << data.y << ' ' << data.z; |
| 86 | + // std::cout << aminoAcid << ' ' << data.x << ' ' << data.y << ' ' << data.z << std::endl; |
| 87 | + |
| 88 | + // construct sequence string |
| 89 | + parsedSequence << aminoAcid; |
| 90 | + |
| 91 | + output.push_back(ss.str()); |
| 92 | +} |
| 93 | + |
| 94 | +// Checks whether the parsed input, so far, produced a valid, sequential |
| 95 | +// list of residues with coordinates. |
| 96 | +// Returns PDBParsingCode.SUCCESS if successful, and a specific error code |
| 97 | +// otherwise. |
| 98 | +PDBParsingCode isPDBInvalid(float &resolution, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent) { |
| 99 | + bool isResolutionValid = resolution < 2.5; |
| 100 | + if (!isResolutionValid) { // resolution too low |
| 101 | + return RESOLUTION_TOO_LOW; |
| 102 | + } |
| 103 | + |
| 104 | + if (resolution == -1) // no valid resolution remark returned |
| 105 | + return RESOLUTION_NOT_SPECIFIED; |
| 106 | + |
| 107 | + if (!isSequenceValid) // missing non-terminal residues |
| 108 | + return MISSING_NON_TERMINAL_RESIDUES; |
| 109 | + |
| 110 | + if (prevCAResiduePosition == -1) { // no single CA atom found |
| 111 | + if (anyCAAtomsPresent) // if any model had, but last one didn't |
| 112 | + return MISSING_NON_TERMINAL_RESIDUES; |
| 113 | + return NO_ALPHA_CARBON_ATOMS_FOUND; |
| 114 | + } |
| 115 | + |
| 116 | + return SUCCESS; |
| 117 | +} |
| 118 | + |
| 119 | +PDBParsingCode isPDBInvalid(float &resolution, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent, bool &isNotProtein) { |
| 120 | + if (isNotProtein) |
| 121 | + return IS_NOT_PROTEIN; |
| 122 | + // if (invalidSequence) |
| 123 | + // return INVALID_SEQUENCE; |
| 124 | + if (hasUnknownResidues) |
| 125 | + return HAS_UNKNOWN_RESIDUE; |
| 126 | + if (invalidAA) |
| 127 | + return EXCLUDE_RARE_AMINO_ACIDS; |
| 128 | + return isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent); |
| 129 | +} |
| 130 | + |
| 131 | +void resetPDBOutput(std::vector<std::string> &output, bool &isSequenceValid, int &prevCAResiduePosition, bool &anyCAAtomsPresent) { |
| 132 | + if (!anyCAAtomsPresent && output.size()) { |
| 133 | + anyCAAtomsPresent = true; |
| 134 | + } |
| 135 | + |
| 136 | + output.clear(); |
| 137 | + isSequenceValid = true; |
| 138 | + hasUnknownResidues = false; |
| 139 | + invalidAA = false; |
| 140 | + prevCAResiduePosition = -1; |
| 141 | + firstCAResidue = 0; |
| 142 | + parsedSequence.str(""); |
| 143 | +} |
| 144 | + |
| 145 | +bool hasMatched = true; |
| 146 | + |
| 147 | +std::vector<std::string> processSequences(std::unordered_map<char, std::stringstream> &sequenceStreams) { |
| 148 | + std::unordered_set<std::string> uniqueSequences; |
| 149 | + std::vector<std::string> sequences; |
| 150 | + std::string matchedSequence = "N/A"; |
| 151 | + |
| 152 | + if (sequenceStreams.size() == 0) |
| 153 | + return sequences; |
| 154 | + |
| 155 | + for (const auto & [_chainId, stream] : sequenceStreams) { |
| 156 | + auto sequence = stream.str(); |
| 157 | + if (sequence.size() != 0) { |
| 158 | + auto sequencePosition = sequence.find(parsedSequence.str()); |
| 159 | + if (parsedSequence.str().size() > 0 && sequencePosition != std::string::npos) { |
| 160 | + matchedSequence = sequence; |
| 161 | + } else { |
| 162 | + uniqueSequences.insert(sequence); |
| 163 | + } |
| 164 | + } |
| 165 | + } |
| 166 | + |
| 167 | + // line 4: matched sequence (parsed contained within matched) |
| 168 | + if (matchedSequence != "") |
| 169 | + std::cout << "matched: " << matchedSequence << std::endl; |
| 170 | + // line 5: sequence parsed from ATOM records |
| 171 | + std::cout << "parsed: " << parsedSequence.str() << std::endl; |
| 172 | + |
| 173 | + sequences.push_back(matchedSequence); |
| 174 | + |
| 175 | + // line 6+: all other parsed sequences |
| 176 | + for (std::string seq: uniqueSequences) { |
| 177 | + std::cout << "other: " << seq << std::endl; |
| 178 | + sequences.push_back(seq); |
| 179 | + } |
| 180 | + |
| 181 | + if (matchedSequence == "N/A") |
| 182 | + hasMatched = false; |
| 183 | + |
| 184 | + return sequences; |
| 185 | +} |
| 186 | + |
| 187 | +// Small script for parsing PDB files. |
| 188 | +// Takes in a stream of a PDB file as input. |
| 189 | +int main() { |
| 190 | + std::ios_base::sync_with_stdio(false); |
| 191 | + std::cin.tie(NULL); |
| 192 | + |
| 193 | + std::vector<std::string> output; |
| 194 | + std::unordered_set<std::string> uniprotIds; |
| 195 | + bool isSequenceValid = true; |
| 196 | + bool processed_atom = false; |
| 197 | + bool anyCAAtomsPresent = false; |
| 198 | + bool isNotProtein = false; |
| 199 | + auto resolution = -1.0f; |
| 200 | + int prevCAResiduePosition = -1; |
| 201 | + |
| 202 | + std::unordered_map<char, std::stringstream> sequenceStreams; |
| 203 | + |
| 204 | + std::string line; |
| 205 | + while (getline(std::cin, line)) { |
| 206 | + std::string param = line.substr(0, 6); |
| 207 | + |
| 208 | + if (param == "HEADER") { |
| 209 | + auto headerType = processHeader(line); |
| 210 | + if (headerType != PROTEIN) { |
| 211 | + isNotProtein = true; |
| 212 | + // break; |
| 213 | + } |
| 214 | + } else if (param == "REMARK") { |
| 215 | + processRemark(line, resolution); |
| 216 | + } else if (param == "DBREF ") { |
| 217 | + processDBRef(line, uniprotIds); |
| 218 | + } else if (param == "DBREF1") { |
| 219 | + processDBRef1(line, uniprotIds); |
| 220 | + } else if (param == "SEQRES") { |
| 221 | + processSequence(line, sequenceStreams); |
| 222 | + } else if (param == "ATOM ") { // HETATM residues are skipped |
| 223 | + processAtom(line, output, prevCAResiduePosition, isSequenceValid); |
| 224 | + } else if (param == "TER ") { // end of one chain |
| 225 | + auto pdbValidity = isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent); |
| 226 | + // std::cout << code_name[pdbValidity] << std::endl; |
| 227 | + if (pdbValidity == SUCCESS) |
| 228 | + break; // terminate parser, output PDB |
| 229 | + |
| 230 | + // if at first you don't succeed, try, try again (parse next model) |
| 231 | + resetPDBOutput(output, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent); |
| 232 | + } // else ignore line, until end is reached |
| 233 | + } |
| 234 | + |
| 235 | + // line 1 -- pdb id |
| 236 | + // "pdb_id: 201L" (printed in Utils.cpp) |
| 237 | + |
| 238 | + // line 2 -- resolution |
| 239 | + std::cout << "resolut: " << resolution << std::endl; |
| 240 | + |
| 241 | + // line 3 -- uniprot IDs |
| 242 | + std::string allUniprotIds = concatenateString(uniprotIds); |
| 243 | + std::cout << "uniprot: " << allUniprotIds << std::endl; |
| 244 | + |
| 245 | + // line 4 -- matched sequence (atom record substring of reqres) |
| 246 | + // line 5 -- parsed sequence (atom records) |
| 247 | + // line 6-n -- other sequences (reqres sequence) |
| 248 | + std::vector<std::string> sequences = processSequences(sequenceStreams); |
| 249 | + |
| 250 | + |
| 251 | + auto pdbValidity = isPDBInvalid(resolution, isSequenceValid, prevCAResiduePosition, anyCAAtomsPresent, isNotProtein); |
| 252 | + if (pdbValidity != SUCCESS) { |
| 253 | + std::cerr << code_name[pdbValidity] << std::endl; |
| 254 | + |
| 255 | + for (auto error : errorOutput) |
| 256 | + std::cout << error << std::endl; |
| 257 | + return pdbValidity; |
| 258 | + } |
| 259 | + |
| 260 | + if (uniprotIds.size() == 0) { |
| 261 | + std::cerr << code_name[NO_UNIPROT_ID] << std::endl; |
| 262 | + return NO_UNIPROT_ID; |
| 263 | + } |
| 264 | + |
| 265 | + // if (!hasMatched) { |
| 266 | + // std::cerr << "NO_MATCH" << std::endl; |
| 267 | + // return 103; |
| 268 | + // // std::cerr << "NO MATCHED SEQUENCE" << std::endl; |
| 269 | + // } |
| 270 | + |
| 271 | + // line n+1: sequence number of initial residue (starts with 1) |
| 272 | + std::cout << "initres: " << firstCAResidue << std::endl; |
| 273 | + |
| 274 | + // line n+2: empty line |
| 275 | + std::cout << std::endl; |
| 276 | + |
| 277 | + // lines n+3 to end: coordinates in format <residue> <x> <y> <z> |
| 278 | + for (std::string pos : output) { |
| 279 | + std::cout << pos << std::endl; |
| 280 | + } |
| 281 | + |
| 282 | + return 0; |
| 283 | +} |
| 284 | + |
0 commit comments