34
34
import org .opencb .cellbase .lib .builders .*;
35
35
import org .opencb .cellbase .lib .builders .clinical .variant .ClinicalVariantBuilder ;
36
36
37
- import java .io .File ;
38
37
import java .io .IOException ;
39
38
import java .nio .file .Files ;
40
39
import java .nio .file .Path ;
@@ -60,11 +59,8 @@ public class BuildCommandExecutor extends CommandExecutor {
60
59
private boolean normalize = true ;
61
60
62
61
private SpeciesConfiguration .Assembly assembly ;
63
- private String ensemblVersion ;
64
62
private String ensemblRelease ;
65
63
66
- private File ensemblScriptsFolder ;
67
-
68
64
private boolean flexibleGTFParsing ;
69
65
private SpeciesConfiguration speciesConfiguration ;
70
66
@@ -75,15 +71,16 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma
75
71
this .output = Paths .get (buildCommandOptions .outputDirectory );
76
72
normalize = !buildCommandOptions .skipNormalize ;
77
73
78
- this .ensemblScriptsFolder = new File (System .getProperty ("basedir" ) + "/bin/ensembl-scripts/" );
79
74
this .flexibleGTFParsing = buildCommandOptions .flexibleGTFParsing ;
80
75
}
81
76
82
-
83
77
/**
84
78
* Parse specific 'build' command options.
79
+ *
80
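+ * @throws CellBaseException if any error other than a command line ParameterException occurs while executing the 'build' command; the original exception is attached as the cause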
85
81
*/
86
- public void execute () {
82
+ public void execute () throws CellBaseException {
83
+ String buildOption = null ;
87
84
try {
88
85
// Output directory needs to be created if it doesn't exist
89
86
if (!Files .exists (output )) {
@@ -104,7 +101,7 @@ public void execute() {
104
101
assembly = SpeciesUtils .getDefaultAssembly (speciesConfiguration );
105
102
}
106
103
107
- ensemblVersion = assembly .getEnsemblVersion ();
104
+ String ensemblVersion = assembly .getEnsemblVersion ();
108
105
ensemblRelease = "release-" + ensemblVersion .split ("_" )[0 ];
109
106
110
107
String spShortName = getSpeciesShortname (speciesConfiguration );
@@ -130,9 +127,8 @@ public void execute() {
130
127
}
131
128
132
129
for (int i = 0 ; i < buildOptions .length ; i ++) {
133
- String buildOption = buildOptions [i ];
130
+ buildOption = buildOptions [i ];
134
131
135
- logger .info ("Building '{}' data" , buildOption );
136
132
CellBaseBuilder parser = null ;
137
133
switch (buildOption ) {
138
134
case EtlCommons .GENOME_DATA :
@@ -156,9 +152,6 @@ public void execute() {
156
152
case EtlCommons .PROTEIN_DATA :
157
153
parser = buildProtein ();
158
154
break ;
159
- // case EtlCommons.PPI_DATA:
160
- // parser = getInteractionParser();
161
- // break;
162
155
case EtlCommons .CONSERVATION_DATA :
163
156
parser = buildConservation ();
164
157
break ;
@@ -181,24 +174,26 @@ public void execute() {
181
174
parser = buildPharmacogenomics ();
182
175
break ;
183
176
default :
184
- logger .error ("Build option '" + buildCommandOptions . data + " ' is not valid" );
177
+ logger .error ("Build option '{} ' is not valid" , buildCommandOptions . data );
185
178
break ;
186
179
}
187
180
188
181
if (parser != null ) {
189
- try {
190
- parser .parse ();
191
- } catch (Exception e ) {
192
- logger .error ("Error executing 'build' command " + buildCommandOptions .data + ": " + e .getMessage (), e );
193
- }
182
+ logger .info ("Building '{}' data ..." , buildOption );
183
+ parser .parse ();
184
+ logger .info ("Building '{}' data. Done." , buildOption );
194
185
parser .disconnect ();
195
186
}
196
187
}
197
188
}
198
189
} catch (ParameterException e ) {
199
190
logger .error ("Error parsing build command line parameters: " + e .getMessage (), e );
200
- } catch (IOException | CellBaseException e ) {
201
- logger .error (e .getMessage ());
191
+ } catch (Exception e ) {
192
+ String msg = "Error executing the command 'build'." ;
193
+ if (StringUtils .isNotEmpty (buildOption )) {
194
+ msg += " It was building the data '" + buildOption + "'" ;
195
+ }
196
+ throw new CellBaseException (msg , e );
202
197
}
203
198
}
204
199
@@ -207,7 +202,6 @@ private CellBaseBuilder buildRepeats() {
207
202
copyVersionFiles (Arrays .asList (repeatsFilesDir .resolve (EtlCommons .TRF_VERSION_FILENAME )));
208
203
copyVersionFiles (Arrays .asList (repeatsFilesDir .resolve (EtlCommons .GSD_VERSION_FILENAME )));
209
204
copyVersionFiles (Arrays .asList (repeatsFilesDir .resolve (EtlCommons .WM_VERSION_FILENAME )));
210
- // TODO: chunk size is not really used in ConvervedRegionParser, remove?
211
205
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer (buildFolder , EtlCommons .REPEATS_JSON );
212
206
return new RepeatsBuilder (repeatsFilesDir , serializer );
213
207
}
@@ -223,44 +217,11 @@ private void copyVersionFiles(List<Path> pathList) {
223
217
try {
224
218
Files .copy (path , downloadFolder .resolve (path .getFileName ()), StandardCopyOption .REPLACE_EXISTING );
225
219
} catch (IOException e ) {
226
- logger .warn ("Version file {} not found - skipping" , path . toString () );
220
+ logger .warn ("Version file {} not found - skipping" , path );
227
221
}
228
222
}
229
223
}
230
224
231
- // private void buildGenomeInfo() {
232
- // /**
233
- // * To get some extra info about the genome such as chromosome length or cytobands
234
- // * we execute the following script.
235
- // */
236
- // try {
237
- // String outputFileName = downloadFolder.resolve("genome_info.json").toAbsolutePath().toString();
238
- // List<String> args = new ArrayList<>();
239
- // args.addAll(Arrays.asList("--species", speciesConfigurathtion.getScientificName(),
240
- // "--assembly", buildCommandOptions.assembly == null ? getDefaultHumanAssembly() : buildCommandOptions.assembly,
241
- // "-o", outputFileName,
242
- // "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs()));
243
- // if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)
244
- // && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) {
245
- // args.add("--phylo");
246
- // args.add("no-vertebrate");
247
- // }
248
- //
249
- // String geneInfoLogFileName = downloadFolder.resolve("genome_info.log").toAbsolutePath().toString();
250
- //
251
- // boolean downloadedGenomeInfo;
252
- // downloadedGenomeInfo = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, "./genome_info.pl", args, geneInfoLogFileName);
253
- //
254
- // if (downloadedGenomeInfo) {
255
- // logger.info(outputFileName + " created OK");
256
- // } else {
257
- // logger.error("Genome info for " + speciesConfiguration.getScientificName() + " cannot be downloaded");
258
- // }
259
- // } catch (IOException | InterruptedException e) {
260
- // e.printStackTrace();
261
- // }
262
- // }
263
-
264
225
private CellBaseBuilder buildGenomeSequence () throws CellBaseException {
265
226
// Sanity check
266
227
Path genomeVersionPath = downloadFolder .resolve (GENOME_SUBDIRECTORY ).resolve (GENOME_VERSION_FILENAME );
@@ -316,50 +277,19 @@ private CellBaseBuilder buildRegulation() {
316
277
}
317
278
318
279
private CellBaseBuilder buildProtein () {
319
- Path proteinFolder = downloadFolder .resolve ("protein" );
280
+ Path proteinFolder = downloadFolder .resolve (PROTEIN_SUBDIRECTORY );
320
281
copyVersionFiles (Arrays .asList (proteinFolder .resolve ("uniprotVersion.json" ),
321
282
proteinFolder .resolve ("interproVersion.json" )));
322
- CellBaseSerializer serializer = new CellBaseJsonFileSerializer (buildFolder , "protein" );
323
- return new ProteinBuilder (proteinFolder .resolve ("uniprot_chunks" ),
324
- downloadFolder .resolve ("protein" ).resolve ("protein2ipr.dat.gz" ), speciesConfiguration .getScientificName (), serializer );
325
- }
326
-
327
- private void getProteinFunctionPredictionMatrices (SpeciesConfiguration sp , Path geneFolder )
328
- throws IOException , InterruptedException {
329
- logger .info ("Downloading protein function prediction matrices ..." );
330
-
331
- // run protein_function_prediction_matrices.pl
332
- String proteinFunctionProcessLogFile = geneFolder .resolve ("protein_function_prediction_matrices.log" ).toString ();
333
- List <String > args = Arrays .asList ("--species" , sp .getScientificName (), "--outdir" , geneFolder .toString (),
334
- "--ensembl-libs" , configuration .getDownload ().getEnsembl ().getLibs ());
335
-
336
- boolean proteinFunctionPredictionMatricesObtaines = EtlCommons .runCommandLineProcess (ensemblScriptsFolder ,
337
- "./protein_function_prediction_matrices.pl" ,
338
- args ,
339
- proteinFunctionProcessLogFile );
340
-
341
- // check output
342
- if (proteinFunctionPredictionMatricesObtaines ) {
343
- logger .info ("Protein function prediction matrices created OK" );
344
- } else {
345
- logger .error ("Protein function prediction matrices for " + sp .getScientificName () + " cannot be downloaded" );
346
- }
347
- }
348
-
349
- private CellBaseBuilder getInteractionParser () {
350
- Path proteinFolder = downloadFolder .resolve ("protein" );
351
- Path psimiTabFile = proteinFolder .resolve ("intact.txt" );
352
- copyVersionFiles (Arrays .asList (proteinFolder .resolve ("intactVersion.json" )));
353
- CellBaseSerializer serializer = new CellBaseJsonFileSerializer (buildFolder , "protein_protein_interaction" );
354
- return new InteractionBuilder (psimiTabFile , speciesConfiguration .getScientificName (), serializer );
283
+ CellBaseSerializer serializer = new CellBaseJsonFileSerializer (buildFolder , PROTEIN_DATA );
284
+ return new ProteinBuilder (proteinFolder .resolve ("uniprot_chunks" ), downloadFolder .resolve (PROTEIN_SUBDIRECTORY )
285
+ .resolve ("protein2ipr.dat.gz" ), speciesConfiguration .getScientificName (), serializer );
355
286
}
356
287
357
288
private CellBaseBuilder buildConservation () {
358
289
Path conservationFilesDir = downloadFolder .resolve ("conservation" );
359
290
copyVersionFiles (Arrays .asList (conservationFilesDir .resolve ("gerpVersion.json" ),
360
291
conservationFilesDir .resolve ("phastConsVersion.json" ),
361
292
conservationFilesDir .resolve ("phyloPVersion.json" )));
362
- // TODO: chunk size is not really used in ConvervedRegionParser, remove?
363
293
int conservationChunkSize = MongoDBCollectionConfiguration .CONSERVATION_CHUNK_SIZE ;
364
294
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer (buildFolder );
365
295
return new ConservationBuilder (conservationFilesDir , conservationChunkSize , serializer );
@@ -406,10 +336,14 @@ private Path getFastaReferenceGenome() throws CellBaseException {
406
336
Path fastaPath = downloadFolder .resolve (GENOME_SUBDIRECTORY ).resolve (fastaFilename );
407
337
if (fastaPath .toFile ().exists ()) {
408
338
// Gunzip
409
- logger .info ("Gunzip file: " + fastaPath );
339
+ logger .info ("Gunzip file: {}" , fastaPath );
410
340
try {
411
341
EtlCommons .runCommandLineProcess (null , "gunzip" , Collections .singletonList (fastaPath .toString ()), null );
412
- } catch (IOException | InterruptedException e ) {
342
+ } catch (IOException e ) {
343
+ throw new CellBaseException ("Error executing gunzip in FASTA file " + fastaPath , e );
344
+ } catch (InterruptedException e ) {
345
+ // Restore interrupted state...
346
+ Thread .currentThread ().interrupt ();
413
347
throw new CellBaseException ("Error executing gunzip in FASTA file " + fastaPath , e );
414
348
}
415
349
}
0 commit comments