Skip to content

Commit

Permalink
Merge pull request #7 from UCSC-nanopore-cgl/small_fixes
Browse files Browse the repository at this point in the history
Small fixes
  • Loading branch information
tpesout authored Sep 16, 2019
2 parents 7b1e687 + c84eec3 commit 3eeabfc
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 116 deletions.
20 changes: 3 additions & 17 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ project(marginPhase VERSION 1.1.0)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -DNDEBUG -std=gnu99 -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200112L -mpopcnt ")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fno-inline -UNDEBUG -fsanitize=address")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fno-inline -UNDEBUG -fsanitize=address -static-libasan")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fno-inline -UNDEBUG")

project(marginPhase LANGUAGES C)
Expand Down Expand Up @@ -308,20 +308,6 @@ set(SOURCE_FILES
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Dependencies
find_library(ZLIB NAMES z)
if(${ZLIB} STREQUAL "ZLIB-NOTFOUND")
message(WARNING "Couldn't find the 'z' library")
endif()

find_library(BZ2LIB bz2)
if(${BZ2LIB} STREQUAL "BZ2LIB-NOTFOUND")
message(WARNING "Couldn't find the 'bz2' library")
endif()

find_library(CURLLIB curl)
if(${CURLLIB} STREQUAL "CURLLIB-NOTFOUND")
message(WARNING "Couldn't find the 'curl' library")
endif()

find_package(OpenMP)
if (OPENMP_FOUND)
Expand Down Expand Up @@ -378,14 +364,14 @@ add_custom_command(
OUTPUT "${PROJECT_SOURCE_DIR}/externalTools/htslib/config.h"
COMMAND autoconf
COMMAND autoheader
COMMAND ./configure --disable-lzma --disable-s3 --disable-plugins --disable-bz2
COMMAND ./configure --disable-lzma --disable-s3 --disable-plugins --disable-bz2 --disable-libcurl
COMMAND make
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/externalTools/htslib/
)
add_custom_target(HTSLIB_CONFIGURED DEPENDS "${PROJECT_SOURCE_DIR}/externalTools/htslib/config.h")
add_library(hts ${HTSLIB_SOURCES})
add_dependencies(hts HTSLIB_CONFIGURED)
target_link_libraries(hts pthread bz2 z curl)
target_link_libraries(hts pthread )

# Build MarginCore Library
add_library(margin ${SOURCE_FILES})
Expand Down
97 changes: 97 additions & 0 deletions impl/chunker.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Released under the MIT license, see LICENSE.txt
*/

#include <omp.h>
#include "margin.h"

BamChunkRead *bamChunkRead_construct() {
Expand Down Expand Up @@ -65,3 +66,99 @@ void bamChunkRead_destruct(BamChunkRead *r) {
free(r);
}

char *mergeContigChunks(char **chunks, int64_t startIdx, int64_t endIdxExclusive, int64_t overlap, Params *params,
char *missingChunkSpacer) {

// merge chunks
stList *polishedReferenceStrings = stList_construct3(0, free); // The polished reference strings, one for each chunk

for (int64_t chunkIdx = startIdx; chunkIdx < endIdxExclusive; chunkIdx++) {
// Get chunk and polished
char* currentChunk = chunks[chunkIdx];

if (stList_length(polishedReferenceStrings) == 0) {
// special case for first chunk
// we must copy the first one because the original and merged strings are freed separately
currentChunk = stString_copy(currentChunk);
} else {
char *previousChunk = stList_peek(polishedReferenceStrings);

// Trim the currrent and previous polished reference strings to remove overlap
int64_t prefixStringCropEnd, suffixStringCropStart;
int64_t overlapMatchWeight = removeOverlap(previousChunk, currentChunk,
overlap, params->polishParams,
&prefixStringCropEnd, &suffixStringCropStart);

// we have an overlap
if (overlapMatchWeight > 0) {
st_logInfo(
" Removed overlap between neighbouring chunks. Approx overlap size: %i, overlap-match weight: %f, "
"left-trim: %i, right-trim: %i:\n", overlap,
(float) overlapMatchWeight / PAIR_ALIGNMENT_PROB_1,
strlen(previousChunk) - prefixStringCropEnd, suffixStringCropStart);

// Crop the suffix of the previous chunk's polished reference string
previousChunk[prefixStringCropEnd] = '\0';

// Crop the the prefix of the current chunk's polished reference string
currentChunk = stString_copy(&(currentChunk[suffixStringCropStart]));

// no good alignment, likely missing chunks but have to be able to handle freaky situations also
} else {
if (strlen(currentChunk) == 0) {
// missing chunk
st_logInfo(" No overlap found. Filling empty chunk with Ns.\n");
currentChunk = stString_copy(missingChunkSpacer);
} else if (overlap == 0) {
// poorly configured but could be done (freaky)
currentChunk = stString_copy(currentChunk);
} else {
// couldn't find an overlap (freaky)
st_logInfo(" No overlap found. Filling Ns in stitch position.\n");
stList_append(polishedReferenceStrings, stString_copy("NNNNNNNNNN"));
currentChunk = stString_copy(currentChunk);
}
}
}

// Add the polished sequence to the list of polished reference sequence chunks
stList_append(polishedReferenceStrings, currentChunk);
}

// finish
char *merged = stString_join2("", polishedReferenceStrings);
stList_destruct(polishedReferenceStrings);
return merged;
}

char *mergeContigChunksThreaded(char **chunks, int64_t startIdx, int64_t endIdxExclusive, int64_t numThreads,
int64_t overlap, Params *params, char *missingChunkSpacer) {

// special unthreaded case
if (numThreads == 1) return mergeContigChunks(chunks, startIdx, endIdxExclusive, overlap, params, missingChunkSpacer);

// divide into chunks
int64_t totalChunks = endIdxExclusive - startIdx;
int64_t chunksPerThread = (int64_t) ceil(1.0 * totalChunks / numThreads);
if (chunksPerThread * (numThreads - 1) >= endIdxExclusive) numThreads--;
char **outputChunks = st_calloc(numThreads, sizeof(char*));

// multithread loop
#pragma omp parallel for schedule(static,1)
for (int64_t thread = 0; thread < numThreads; thread++) {
int64_t threadedStartIdx = startIdx + chunksPerThread * thread;
int64_t threadedEndIdxExclusive = threadedStartIdx + chunksPerThread;
if (endIdxExclusive < threadedEndIdxExclusive) threadedEndIdxExclusive = endIdxExclusive;

outputChunks[thread] = mergeContigChunks(chunks, threadedStartIdx, threadedEndIdxExclusive, overlap,
params, missingChunkSpacer);
}

// finish
char *contig = mergeContigChunks(outputChunks, 0, numThreads, overlap, params, missingChunkSpacer);
for (int64_t i = 0; i < numThreads; i++) {
free(outputChunks[i]);
}
free(outputChunks);
return contig;
}
4 changes: 4 additions & 0 deletions inc/margin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,10 @@ void bamChunkRead_destruct(BamChunkRead *bamChunkRead);
*/
int64_t removeOverlap(char *prefixString, char *suffixString, int64_t approxOverlap, PolishParams *polishParams,
int64_t *prefixStringCropEnd, int64_t *suffixStringCropStart);
char *mergeContigChunksThreaded(char **chunks, int64_t startIdx, int64_t endIdxExclusive, int64_t numThreads,
int64_t overlap, Params *params, char *missingChunkSpacer);
char *mergeContigChunks(char **chunks, int64_t startIdx, int64_t endIdxExclusive,
int64_t overlap, Params *params, char *missingChunkSpacer);

/*
* View functions
Expand Down
Loading

0 comments on commit 3eeabfc

Please # to comment.