-
Notifications
You must be signed in to change notification settings - Fork 125
fastq_syncpairs: implementation prototype
Frédéric Mahé edited this page Mar 1, 2023
·
2 revisions
Here is a C++11 draft-implementation of a potential memory-friendly fastq_syncpairs
command (re-synchronize R1-R2 fastq files). This is not a priority, so I use this wiki page as an archival system.
#include <fstream>
#include <iostream>
#include <limits>
#include <unordered_map>
#include <utility> // std::pair
#include <string>
// Summary:
// - assume entries on 4 lines, with headers ending with '/1' or '/2', no empty entries
// - for each R2 entry, record header string, begin and end,
// - for each R1 entry, if present in R2, then write to 'synced' files,
// - if not, write to 'orphan' files
// Properties:
// - C+11, no warning
// - fast enough (75 seconds for 9 million paired reads),
// - memory footprint is reduced (roughly 1/4 of the size of the R2 file),
// - works with files with more than 2 billion chars (int64),
// - R1 is read only once,
// - reads are sorted in R1's order,
// - R1 orphans are in R1 file's order, but R2 orphans are in no particular order
// const std::string R1_file {"input_R1.fastq"};
const std::string R1_file {"/tmp/Neotropical_soils_plastidial/data/2016-10-13/161007_SN234_A_L001_GYJ-7_R1.fastq"};
// const std::string R2_file {"input_R2.fastq"};
const std::string R2_file {"/tmp/Neotropical_soils_plastidial/data/2016-10-13/161007_SN234_A_L001_GYJ-7_R2.fastq"};
using Map = std::unordered_map<std::string, std::pair<long int, long int>>;
auto parse_R2 () -> Map {
std::cout << "parse R2" << std::endl;
std::ifstream R2_entries {R2_file};
std::string header;
constexpr auto unlimited_length { std::numeric_limits<std::streamsize>::max() };
Map R2_map = {};
auto start {R2_entries.tellg()};
while (std::getline(R2_entries, header))
{
R2_entries.ignore(unlimited_length, '\n'); // sequence string
R2_entries.ignore(unlimited_length, '\n'); // quality header
R2_entries.ignore(unlimited_length, '\n'); // quality string
const auto end {R2_entries.tellg()};
R2_map[header] = {start, end - start};
start = end;
}
return R2_map;
}
auto longest_R2 (const Map& R2_map) -> long unsigned int {
std::cout << "search longest R2 entry" << std::endl;
auto max_length {0L};
for (auto const& entry : R2_map) {
const auto offset { entry.second.second };
if (offset > max_length) {
max_length = offset;
}
}
return static_cast<long unsigned int>(max_length);
}
auto fetch_R2 (const long int start,
const long int offset,
std::string & buffer,
std::ifstream& R2_entries,
std::ofstream& synced_R2) -> void {
R2_entries.seekg(start);
R2_entries.read(&buffer[0], offset);
synced_R2.write(&buffer[0], offset);
buffer.clear();
}
auto read_R1 (Map& R2_map, std::string & buffer) -> void {
std::cout << "read R1" << std::endl;
std::ifstream R1_entries {R1_file};
std::ifstream R2_entries {R2_file};
std::ofstream synced_R1 {"synced_R1.fastq"};
std::ofstream orphan_R1 {"orphan_R1.fastq"};
std::ofstream synced_R2 {"synced_R2.fastq"};
std::string line;
std::string header;
while (std::getline(R1_entries, line)) {
header = line;
header.replace(header.length() - 1, 1, "2");
const auto match { R2_map.find(header) }; // C++20: map.contains()
if (match != R2_map.end()) {
const auto start {match->second.first};
const auto offset {match->second.second};
fetch_R2(start, offset, buffer, R2_entries, synced_R2);
R2_map.erase(match);
synced_R1 << line << '\n';
std::getline(R1_entries, line);
synced_R1 << line << '\n';
std::getline(R1_entries, line);
synced_R1 << line << '\n';
std::getline(R1_entries, line);
synced_R1 << line << '\n';
} else {
orphan_R1 << line << '\n';
std::getline(R1_entries, line);
orphan_R1 << line << '\n';
std::getline(R1_entries, line);
orphan_R1 << line << '\n';
std::getline(R1_entries, line);
orphan_R1 << line << '\n';
}
}
}
auto flush_R2 (const Map& R2_map, std::string & buffer) -> void {
std::cout << "ouput orphan R2 entries" << std::endl;
std::ifstream R2_entries {R2_file};
std::ofstream orphan_R2 {"orphan_R2.fastq"};
for (auto const& entry : R2_map) {
const auto start {entry.second.first};
const auto offset {entry.second.second};
R2_entries.seekg(start);
R2_entries.read(&buffer[0], offset);
orphan_R2.write(&buffer[0], offset);
}
}
auto main() -> int {
// printf is not used
std::ios_base::sync_with_stdio(false);
// map R2 entries
Map R2_map = parse_R2();
// find max entry length
const auto max_length { longest_R2(R2_map) };
std::string buffer (max_length, '\0');
// read R1
read_R1(R2_map, buffer);
// deal with remaining R2 entries
flush_R2(R2_map, buffer);
return 0;
}