Skip to content

Commit dc52098

Browse files
committed
Changes for using ProstT5
1 parent e739fef commit dc52098

File tree

2 files changed

+107
-52
lines changed

2 files changed

+107
-52
lines changed

src/strucclustutils/msa2lddt.cpp

+78-38
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,22 @@ int msa2lddt(int argc, const char **argv, const Command& command, int makeReport
369369
seqDbrAA.open(DBReader<unsigned int>::NOSORT);
370370
DBReader<unsigned int> seqDbr3Di((par.db1+"_ss").c_str(), (par.db1+"_ss.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
371371
seqDbr3Di.open(DBReader<unsigned int>::NOSORT);
372-
DBReader<unsigned int> seqDbrCA((par.db1+"_ca").c_str(), (par.db1+"_ca.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
373-
seqDbrCA.open(DBReader<unsigned int>::NOSORT);
372+
373+
// Check for CA database
374+
DBReader<unsigned int> *seqDbrCA = NULL;
375+
bool caExist = FileUtil::fileExists((par.db1 + "_ca.dbtype").c_str());
376+
if (caExist == false) {
377+
Debug(Debug::INFO) << "Did not find " << FileUtil::baseName(par.db1) << " C-alpha database, not calculating LDDT\n";
378+
} else {
379+
seqDbrCA = new DBReader<unsigned int>(
380+
(par.db1 + "_ca").c_str(),
381+
(par.db1 + "_ca.index").c_str(),
382+
par.threads,
383+
DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA
384+
);
385+
seqDbrCA->open(DBReader<unsigned int>::NOSORT);
386+
}
387+
374388
IndexReader headerDB(par.db1, par.threads, IndexReader::HEADERS, touch ? IndexReader::PRELOAD_INDEX : 0);
375389

376390
// Read in MSA, mapping headers to database indices
@@ -394,20 +408,21 @@ int msa2lddt(int argc, const char **argv, const Command& command, int makeReport
394408
for (size_t i = 0; i < subset.size(); i++)
395409
subset[i] = i;
396410

397-
std::tie(perColumnScore, perColumnCount, lddtScore, numCols) = calculate_lddt(cigars_aa, subset, indices, lengths, &seqDbrCA, par.pairThreshold);
398-
399-
400-
std::string scores;
401-
for (float score : perColumnScore) {
402-
if (scores.length() > 0) scores += ",";
403-
scores += std::to_string(score);
411+
if (caExist) {
412+
std::tie(perColumnScore, perColumnCount, lddtScore, numCols) = calculate_lddt(cigars_aa, subset, indices, lengths, seqDbrCA, par.pairThreshold);
413+
std::string scores;
414+
for (float score : perColumnScore) {
415+
if (scores.length() > 0) scores += ",";
416+
scores += std::to_string(score);
417+
}
418+
Debug(Debug::INFO) << "Average MSA LDDT: " << lddtScore << '\n';
419+
Debug(Debug::INFO) << "Columns considered: " << numCols << "/" << alnLength << '\n';
420+
Debug(Debug::INFO) << "Column scores: " << scores << '\n';
404421
}
405-
std::cout << "Average MSA LDDT: " << lddtScore << '\n';
406-
std::cout << "Columns considered: " << numCols << "/" << alnLength << '\n';
407-
std::cout << "Column scores: " << scores << '\n';
408422

409423
// Write clustal format MSA HTML
410424
if (makeReport) {
425+
Debug(Debug::INFO) << "Generating report\n";
411426
DBWriter resultWriter(par.db3.c_str(), (par.db3 + ".index").c_str(), static_cast<unsigned int>(par.threads), par.compressed, Parameters::DBTYPE_OMIT_FILE);
412427
resultWriter.open();
413428

@@ -470,34 +485,46 @@ R"html(<!DOCTYPE html>
470485
for (size_t i = 0; i < cigars_aa.size(); i++) {
471486
std::string seq_aa = expand(cigars_aa[i]);
472487
std::string seq_ss = expand(cigars_ss[i]);
473-
std::string seq_ca = getXYZstring(indices[i], lengths[i], &seqDbrCA);
474488
std::string entry;
475489
entry.append("{\"name\":\"");
476490
entry.append(headers[i]);
477491
entry.append("\",\"aa\": \"");
478492
entry.append(seq_aa);
479493
entry.append("\",\"ss\": \"");
480494
entry.append(seq_ss);
481-
entry.append("\",\"ca\": \"");
482-
entry.append(seq_ca);
483-
entry.append("\"}");
484-
if (i != cigars_aa.size() - 1)
495+
entry.append("\"");
496+
if (caExist) {
497+
std::string seq_ca = getXYZstring(indices[i], lengths[i], seqDbrCA);
498+
entry.append(",\"ca\": \"");
499+
entry.append(seq_ca);
500+
entry.append("\"");
501+
}
502+
entry.append("}");
503+
if (i != cigars_aa.size() - 1) {
485504
entry.append(",");
505+
} else {
506+
entry.append("]");
507+
}
486508
resultWriter.writeData(entry.c_str(), entry.length(), 0, 0, false, false);
487-
}
488-
489-
std::string middle = "],\"scores\": [";
490-
resultWriter.writeData(middle.c_str(), middle.length(), 0, 0, false, false);
509+
}
491510

492511
// Per-column scores, as [score, score, ...]
493-
// TODO: optionally save this as .csv
494-
for (int i = 0; i < alnLength; i++) {
495-
std::string entry = (perColumnCount[i] == 0) ? "-1" : std::to_string(perColumnScore[i]);
496-
if (i != alnLength - 1)
497-
entry.append(",");
498-
resultWriter.writeData(entry.c_str(), entry.length(), 0, 0, false, false);
512+
std::string middle = "";
513+
514+
if (caExist) {
515+
middle.append(",\"scores\": [");
516+
for (int i = 0; i < alnLength; i++) {
517+
std::string entry = (perColumnCount[i] == 0) ? "-1" : std::to_string(perColumnScore[i]);
518+
if (i != alnLength - 1) {
519+
entry.append(",");
520+
}
521+
middle.append(entry);
522+
}
523+
middle.append("]");
499524
}
500-
std::string end = "],";
525+
resultWriter.writeData(middle.c_str(), middle.length(), 0, 0, false, false);
526+
527+
std::string end = "";
501528

502529
if (par.guideTree != "") {
503530
std::string tree;
@@ -508,25 +535,37 @@ R"html(<!DOCTYPE html>
508535
tree += line;
509536
newick.close();
510537
}
511-
end.append("\"tree\": \"");
538+
end.append(",\"tree\": \"");
512539
end.append(tree);
513-
end.append("\",");
540+
end.append("\"");
514541
}
515-
end.append("\"statistics\": {");
542+
end.append(",\"statistics\": {");
543+
544+
bool hasPrev = false;
516545
if (par.reportPaths) {
517546
end.append("\"db\":\"");
518547
end.append(par.db1);
519548
end.append("\",\"msaFile\":\"");
520549
end.append(par.db2);
521-
end.append("\",");
550+
end.append("\"");
551+
hasPrev = true;
552+
}
553+
if (caExist) {
554+
if (hasPrev) {
555+
end.append(",");
556+
}
557+
end.append("\"msaLDDT\":");
558+
end.append(std::to_string(lddtScore));
559+
hasPrev = true;
522560
}
523-
end.append("\"msaLDDT\":");
524-
end.append(std::to_string(lddtScore));
525-
526561
if (par.reportCommand != "") {
527-
end.append(",\"cmdString\":\"");
562+
if (hasPrev) {
563+
end.append(",");
564+
}
565+
end.append("\"cmdString\":\"");
528566
end.append(par.reportCommand);
529567
end.append("\"");
568+
hasPrev = true;
530569
}
531570
end.append("}}");
532571

@@ -544,9 +583,10 @@ R"html(<!DOCTYPE html>
544583
}
545584

546585
seqDbrAA.close();
547-
seqDbrCA.close();
548586
seqDbr3Di.close();
549-
587+
if (caExist) {
588+
seqDbrCA->close();
589+
}
550590
return EXIT_SUCCESS;
551591
}
552592

src/strucclustutils/structuremsa.cpp

+29-14
Original file line numberDiff line numberDiff line change
@@ -1054,26 +1054,26 @@ Matcher::result_t pairwiseTMAlign(
10541054
int mergedId,
10551055
int targetId,
10561056
DBReader<unsigned int> &seqDbrAA,
1057-
DBReader<unsigned int> &seqDbrCA
1057+
DBReader<unsigned int> *seqDbrCA
10581058
) {
10591059
int qLen = seqDbrAA.getSeqLen(mergedId);
10601060
int tLen = seqDbrAA.getSeqLen(targetId);
10611061

10621062
unsigned int qKey = seqDbrAA.getDbKey(mergedId);
1063-
size_t qCaId = seqDbrCA.getId(qKey);
1063+
size_t qCaId = seqDbrCA->getId(qKey);
10641064

10651065
unsigned int tKey = seqDbrAA.getDbKey(targetId);
1066-
size_t tCaId = seqDbrCA.getId(tKey);
1066+
size_t tCaId = seqDbrCA->getId(tKey);
10671067

10681068
Coordinate16 qcoords;
1069-
char *qcadata = seqDbrCA.getData(qCaId, 0);
1070-
size_t qCaLength = seqDbrCA.getEntryLen(qCaId);
1069+
char *qcadata = seqDbrCA->getData(qCaId, 0);
1070+
size_t qCaLength = seqDbrCA->getEntryLen(qCaId);
10711071
float *qCaData = qcoords.read(qcadata, qLen, qCaLength);
10721072
char *merged_aa_seq = seqDbrAA.getData(qCaId, 0);
10731073

10741074
Coordinate16 tcoords;
1075-
char *tcadata = seqDbrCA.getData(tCaId, 0);
1076-
size_t tCaLength = seqDbrCA.getEntryLen(tCaId);
1075+
char *tcadata = seqDbrCA->getData(tCaId, 0);
1076+
size_t tCaLength = seqDbrCA->getEntryLen(tCaId);
10771077
float *tCaData = tcoords.read(tcadata, tLen, tCaLength);
10781078
char *target_aa_seq = seqDbrAA.getData(tCaId, 0);
10791079

@@ -1117,8 +1117,21 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
11171117
seqDbrAA.open(DBReader<unsigned int>::NOSORT);
11181118
DBReader<unsigned int> seqDbr3Di((par.db1+"_ss").c_str(), (par.db1+"_ss.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
11191119
seqDbr3Di.open(DBReader<unsigned int>::NOSORT);
1120-
DBReader<unsigned int> seqDbrCA((par.db1+"_ca").c_str(), (par.db1+"_ca.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
1121-
seqDbrCA.open(DBReader<unsigned int>::NOSORT);
1120+
1121+
// Check for CA database
1122+
DBReader<unsigned int> *seqDbrCA = NULL;
1123+
bool caExist = FileUtil::fileExists((par.db1 + "_ca.dbtype").c_str());
1124+
if (caExist == false) {
1125+
Debug(Debug::INFO) << "Did not find " << FileUtil::baseName(par.db1) << " C-alpha database, not using\n";
1126+
} else {
1127+
seqDbrCA = new DBReader<unsigned int>(
1128+
(par.db1 + "_ca").c_str(),
1129+
(par.db1 + "_ca.index").c_str(),
1130+
par.threads,
1131+
DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA
1132+
);
1133+
seqDbrCA->open(DBReader<unsigned int>::NOSORT);
1134+
}
11221135

11231136
IndexReader qdbrH(par.db1, par.threads, IndexReader::HEADERS, touch ? IndexReader::PRELOAD_INDEX : 0);
11241137

@@ -1512,7 +1525,7 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
15121525
// If neither are profiles, do TM-align as well and take the best alignment
15131526
bool tmaligned = false;
15141527
// if (false) {
1515-
if (!queryIsProfile && !targetIsProfile) {
1528+
if (caExist && !queryIsProfile && !targetIsProfile) {
15161529
Matcher::result_t tmRes = pairwiseTMAlign(mergedId, targetId, seqDbrAA, seqDbrCA);
15171530
std::vector<Instruction> qBtTM;
15181531
std::vector<Instruction> tBtTM;
@@ -1549,7 +1562,7 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
15491562
std::vector<size_t> indices_tm = { dbKeys[mergedId], dbKeys[targetId] };
15501563
std::vector<int> lengths_tm = { seqLens[mergedId], seqLens[targetId] };
15511564

1552-
float lddtTM = std::get<2>(calculate_lddt(cigars_tm, subset_tm, indices_tm, lengths_tm, &seqDbrCA, par.pairThreshold));
1565+
float lddtTM = std::get<2>(calculate_lddt(cigars_tm, subset_tm, indices_tm, lengths_tm, seqDbrCA, par.pairThreshold));
15531566
// std::cout << "got TM lddt: " << lddtTM << '\n';
15541567

15551568
// adjust cigars with 3Di alignment result
@@ -1578,7 +1591,7 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
15781591
// std::cout << expand(query_aa) << '\n';
15791592
// std::cout << expand(target_aa) << '\n';
15801593

1581-
float lddt3Di = std::get<2>(calculate_lddt(cigars_tm, subset_tm, indices_tm, lengths_tm, &seqDbrCA, par.pairThreshold));
1594+
float lddt3Di = std::get<2>(calculate_lddt(cigars_tm, subset_tm, indices_tm, lengths_tm, seqDbrCA, par.pairThreshold));
15821595
// std::cout << "got 3Di lddt: " << lddt3Di << '\n';
15831596

15841597
if (lddtTM > lddt3Di) {
@@ -1686,7 +1699,7 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
16861699
{
16871700
if (par.refineIters > 0) {
16881701
refineMany(
1689-
tinySubMatAA, tinySubMat3Di, &seqDbrCA, cigars_aa, cigars_ss, calculator_aa,
1702+
tinySubMatAA, tinySubMat3Di, seqDbrCA, cigars_aa, cigars_ss, calculator_aa,
16901703
filter_aa, subMat_aa, calculator_3di, filter_3di, subMat_3di, structureSmithWaterman,
16911704
par.refineIters, par.compBiasCorrection, par.wg, par.filterMaxSeqId, par.qsc,
16921705
par.Ndiff, par.covMSAThr, par.filterMinEnable, par.filterMsa, par.gapExtend.values.aminoacid(),
@@ -1749,7 +1762,9 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
17491762
free(tinySubMat3Di);
17501763
seqDbrAA.close();
17511764
seqDbr3Di.close();
1752-
seqDbrCA.close();
1765+
if (caExist) {
1766+
seqDbrCA->close();
1767+
}
17531768

17541769
return EXIT_SUCCESS;
17551770
}

0 commit comments

Comments
 (0)