Skip to content

Commit

Permalink
lib: improve the PubMed downloader by adding log messages and fixing …
Browse files Browse the repository at this point in the history
…Sonar issues, #TASK-5775, #TASK-5564
  • Loading branch information
jtarraga committed Apr 25, 2024
1 parent 971235e commit cd444b0
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public void execute() throws CellBaseException {
Thread.currentThread().interrupt();
throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e);
} catch (Exception e) {
e.printStackTrace();
throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,10 @@ public final class EtlCommons {
// PubMed
public static final String PUBMED_NAME = "PubMed";
public static final String PUBMED_DATA = "pubmed";
public static final String PUBMED_SUBDIRECTORY = PUBMED_DATA;
public static final String PUBMED_VERSION_FILENAME = PUBMED_DATA + SUFFIX_VERSION_FILENAME;
public static final String PUBMED_REGEX_FILE_ID = "PUBMED";
public static final String PUBMED_SUBDIRECTORY = "pubmed";
public static final String PUBMED_VERSION_FILENAME = "pubMed" + SUFFIX_VERSION_FILENAME;
// Must match the configuration file
public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX";

private EtlCommons() {
throw new IllegalStateException("Utility class");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.*;

public class PubMedDownloadManager extends AbstractDownloadManager {

public PubMedDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
Expand All @@ -36,13 +38,14 @@ public PubMedDownloadManager(String species, String assembly, Path targetDirecto

@Override
public List<DownloadFile> download() throws IOException, InterruptedException, CellBaseException {
Path pubmedFolder = downloadFolder.resolve(EtlCommons.PUBMED_SUBDIRECTORY);
logger.info(DOWNLOADING_LOG_MESSAGE, PUBMED_NAME);

Path pubmedFolder = downloadFolder.resolve(PUBMED_SUBDIRECTORY);
Files.createDirectories(pubmedFolder);
logger.info("Downloading {} files at {} ...", EtlCommons.PUBMED_DATA, pubmedFolder);

// Downloads PubMed XML files
String url = configuration.getDownload().getPubmed().getHost();
String regexp = configuration.getDownload().getPubmed().getFiles().get(EtlCommons.PUBMED_REGEX_FILE_ID);
String host = configuration.getDownload().getPubmed().getHost();
String regexp = configuration.getDownload().getPubmed().getFiles().get(PUBMED_REGEX_FILE_ID);
String[] name = regexp.split("[\\[\\]]");
String[] split = name[1].split("\\.\\.");
int start = Integer.parseInt(split[0]);
Expand All @@ -51,13 +54,18 @@ public List<DownloadFile> download() throws IOException, InterruptedException, C

List<DownloadFile> downloadFiles = new ArrayList<>();
for (int i = start; i <= end; i++) {
String filename = name[0] + String.format("%0" + padding + "d", i) + name[2];
logger.info("\tDownloading from {} to {} ", url + "/" + filename, pubmedFolder.resolve(filename));
downloadFiles.add(downloadFile(url + "/" + filename, pubmedFolder.resolve(filename).toString()));
String padString = "%0" + padding + "d";
String filename = name[0] + String.format(padString, i) + name[2];
String url = host + filename;
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, pubmedFolder.resolve(filename));
downloadFiles.add(downloadFile(url, pubmedFolder.resolve(filename).toString()));
}

saveDataSource(EtlCommons.PUBMED_NAME, EtlCommons.PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(),
Collections.singletonList(url), pubmedFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME));
// Save data source
saveDataSource(EtlCommons.PUBMED_NAME, PUBMED_DATA, configuration.getDownload().getPubmed().getVersion(), getTimeStamp(),
Collections.singletonList(host), pubmedFolder.resolve(PUBMED_VERSION_FILENAME));

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, PUBMED_NAME);

return downloadFiles;
}
Expand Down

0 comments on commit cd444b0

Please sign in to comment.