Skip to content

Commit

Permalink
lib: update conservation builder by removing the hardcoded filenames …
Browse files Browse the repository at this point in the history
…and taking them from the version files (i.e., URLs of the DataSource), improve log/exception messages, and fix Sonar issues, #TASK-5564
  • Loading branch information
jtarraga committed Apr 22, 2024
1 parent 148814f commit 30a4c87
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 265 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,7 @@ public void execute() throws CellBaseException {
}

if (parser != null) {
logger.info(CellBaseBuilder.BUILDING_LOG_MESSAGE, data);
parser.parse();
logger.info(CellBaseBuilder.BUILDING_DONE_LOG_MESSAGE, data);
parser.disconnect();
}
}
Expand Down Expand Up @@ -285,14 +283,16 @@ private CellBaseBuilder buildProtein() {
.resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer);
}

private CellBaseBuilder buildConservation() {
Path conservationFilesDir = downloadFolder.resolve("conservation");
copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"),
conservationFilesDir.resolve("phastConsVersion.json"),
conservationFilesDir.resolve("phyloPVersion.json")));
private CellBaseBuilder buildConservation() throws CellBaseException {
// Sanity check
Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_SUBDIRECTORY);
copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_VERSION_FILENAME),
conservationDownloadPath.resolve(PHASTCONS_VERSION_FILENAME), conservationDownloadPath.resolve(PHYLOP_VERSION_FILENAME)),
buildFolder.resolve(CONSERVATION_SUBDIRECTORY));

int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE;
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder);
return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer);
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(CONSERVATION_SUBDIRECTORY));
return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer);
}

private CellBaseBuilder buildClinicalVariants() throws CellBaseException {
Expand Down
14 changes: 10 additions & 4 deletions cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
Original file line number Diff line number Diff line change
Expand Up @@ -333,13 +333,15 @@ public class EtlCommons {
public static final String GERP_FILE_ID = "GERP";
// PHASTCONS
public static final String PHASTCONS_NAME = "PhastCons";
public static final String PHASTCONS_SUBDIRECTORY = "phastCons";
public static final String PHASTCONS_VERSION_FILENAME = "phastCons" + SUFFIX_VERSION_FILENAME;
public static final String PHASTCONS_DATA = "phastCons";
public static final String PHASTCONS_SUBDIRECTORY = PHASTCONS_DATA;
public static final String PHASTCONS_VERSION_FILENAME = PHASTCONS_DATA + SUFFIX_VERSION_FILENAME;
public static final String PHASTCONS_FILE_ID = "PHASTCONS";
// PHYLOP
public static final String PHYLOP_NAME = "PhyloP";
public static final String PHYLOP_SUBDIRECTORY = "phylop";
public static final String PHYLOP_VERSION_FILENAME = "phylop" + SUFFIX_VERSION_FILENAME;
public static final String PHYLOP_DATA = "phylop";
public static final String PHYLOP_SUBDIRECTORY = PHYLOP_DATA;
public static final String PHYLOP_VERSION_FILENAME = PHYLOP_DATA + SUFFIX_VERSION_FILENAME;
public static final String PHYLOP_FILE_ID = "PHYLOP";

// Splice scores
Expand Down Expand Up @@ -502,4 +504,8 @@ public static String getUrl(DownloadProperties.URLProperties props, String fileI
}
return url;
}

/**
 * Builds a filename by joining the given prefix and chromosome with an underscore.
 *
 * @param prefix     Leading part of the filename
 * @param chromosome Chromosome identifier placed after the underscore
 * @return The string {@code prefix + "_" + chromosome}
 */
public static String getFilename(String prefix, String chromosome) {
    // String.join renders a null element as "null", exactly as string concatenation does
    return String.join("_", prefix, chromosome);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,19 @@

package org.opencb.cellbase.lib.builders;

import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.models.DataSource;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

/**
* Created by imedina on 30/08/14.
*/
Expand All @@ -30,7 +39,10 @@ public abstract class CellBaseBuilder {
protected Logger logger;

public static final String BUILDING_LOG_MESSAGE = "Building {} ...";
public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done!";
public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done.";

public static final String PARSING_LOG_MESSAGE = "Parsing file {} ...";
public static final String PARSING_DONE_LOG_MESSAGE = "Parsing file {} done.";


public CellBaseBuilder(CellBaseSerializer serializer) {
Expand All @@ -50,4 +62,24 @@ public void disconnect() {
}
}

/**
 * Checks that the target folder exists and that it contains one file for each URL of the data source.
 * The expected filename of each URL is the one returned by {@code Paths.get(url).getFileName()}.
 *
 * @param dataSource Data source whose URLs determine the expected filenames
 * @param targetPath Folder where the files are expected to be found
 * @param name       Name used in the log and exception messages to identify the data being checked
 * @return The files found in the target folder, in the same order as the data source URLs
 * @throws CellBaseException if the target folder, or any of the expected files, does not exist
 */
protected List<File> checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException {
    logger.info("Checking {} folder and files", name);
    if (!targetPath.toFile().exists()) {
        throw new CellBaseException(name + " folder does not exist " + targetPath);
    }

    // NOTE(review): the URLs are parsed with Paths.get, which follows the path syntax of the default
    // file system; confirm that every URL is a valid path string on all target platforms
    List<String> filenames = dataSource.getUrls().stream()
            .map(u -> Paths.get(u).getFileName().toString())
            .collect(Collectors.toList());

    List<File> files = new ArrayList<>(filenames.size());
    for (String filename : filenames) {
        File file = targetPath.resolve(filename).toFile();
        if (!file.exists()) {
            // Fail at the first missing file
            throw new CellBaseException("File " + file + " does not exist");
        }
        files.add(file);
    }

    return files;
}
}
Loading

0 comments on commit 30a4c87

Please sign in to comment.