Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Record makeblastdb errors for debugging purposes #683

Merged
merged 2 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 33 additions & 16 deletions lib/sequenceserver/makeblastdb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@ def initialize(database_dir)
@database_dir = database_dir
end

attr_reader :database_dir
attr_reader :formatted_fastas
attr_reader :fastas_to_format
attr_reader :fastas_to_reformat
attr_reader :database_dir, :formatted_fastas, :fastas_to_format, :fastas_to_reformat

# Scans the database directory to determine which FASTA files require
# formatting or re-formatting.
Expand Down Expand Up @@ -64,6 +61,7 @@ def any_unformatted?
def any_incompatible?
return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }

true
end

Expand All @@ -81,6 +79,7 @@ def format
# Make the intent clear as well as ensure the program won't crash if we
# accidentally call format before calling scan.
return unless @fastas_to_format

@fastas_to_format.select do |path, title, type|
make_blast_database('format', path, title, type)
end
Expand All @@ -92,6 +91,7 @@ def reformat
# Make the intent clear as well as ensure the program won't crash if
# we accidentally call reformat before calling scan.
return unless @fastas_to_reformat

@fastas_to_reformat.select do |path, title, type, non_parse_seqids|
make_blast_database('reformat', path, title, type, non_parse_seqids)
end
Expand All @@ -105,6 +105,7 @@ def determine_formatted_fastas
blastdbcmd.each_line do |line|
path, *rest = line.chomp.split("\t")
next if multipart_database_name?(path)

rest << get_categories(path)
@formatted_fastas << Database.new(path, *rest)
end
Expand All @@ -114,9 +115,7 @@ def determine_formatted_fastas
# reformatting. Adds to @fastas_to_format.
def determine_fastas_to_reformat
@formatted_fastas.each do |ff|
if ff.v4? || ff.non_parse_seqids?
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?]
end
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?] if ff.v4? || ff.non_parse_seqids?
end
end

Expand All @@ -132,8 +131,8 @@ def determine_unformatted_fastas
next if @formatted_fastas.any? { |f| f[0] == path }

@fastas_to_format << [path,
make_db_title(path),
guess_sequence_type_in_fasta(path)]
make_db_title(path),
guess_sequence_type_in_fasta(path)]
end
end

Expand All @@ -146,14 +145,16 @@ def blastdbcmd
out, err = sys(cmd, path: config[:bin])
errpat = /BLAST Database error/
fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
return out

out
rescue CommandFailed => e
fail BLAST_DATABASE_ERROR.new(cmd, e.stderr)
raise BLAST_DATABASE_ERROR.new(cmd, e.stderr)
end

# Create BLAST database, given FASTA file and sequence type in FASTA file.
def make_blast_database(action, file, title, type, non_parse_seqids = false)
return unless make_blast_database?(action, file, type)

title = confirm_database_title(title)
extract_fasta(file) unless File.exist?(file)
taxonomy = taxid_map(file, non_parse_seqids) || taxid
Expand Down Expand Up @@ -188,9 +189,10 @@ def confirm_database_title(default)
# using blastdbcmd.
def taxid_map(db, non_parse_seqids)
return if non_parse_seqids

taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
extract_taxid_map(db, taxid_map) if !File.exist?(taxid_map)
"-taxid_map #{taxid_map}" if !File.zero?(taxid_map)
extract_taxid_map(db, taxid_map) unless File.exist?(taxid_map)
"-taxid_map #{taxid_map}" unless File.zero?(taxid_map)
end

# Get taxid from the user. Returns user input or 0.
Expand All @@ -211,10 +213,24 @@ def _make_blast_database(file, type, title, taxonomy)
cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
" -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
" #{taxonomy}"
out, err = sys(cmd, path: config[:bin])

output = if File.directory?(file)
File.join(file, 'makeblastdb')
else
"#{file}.makeblastdb"
end

out, err = sys(
cmd,
path: config[:bin],
stderr: [output, 'stderr'].join,
stdout: [output, 'stdout'].join
)

puts out.strip
puts err.strip
return true

true
rescue CommandFailed => e
puts <<~MSG
Could not create BLAST database for: #{file}
Expand Down Expand Up @@ -261,7 +277,7 @@ def extract_taxid_map(db, taxmap_file)
# /home/ben/pd.ben/sequenceserver/db/nr00 => no
# /mnt/blast-db/refseq_genomic.100 => yes
def multipart_database_name?(db_name)
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
!db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?
end

def get_categories(path)
Expand All @@ -274,6 +290,7 @@ def get_categories(path)
# Returns true if first character of the file is '>'.
def probably_fasta?(file)
return false unless file.match(/((cds)|(fasta)|(fna)|(pep)|(cdna)|(fa)|(prot)|(fas)|(genome)|(nuc)|(dna)|(nt))$/i)

File.read(file, 1) == '>'
end

Expand Down
2 changes: 1 addition & 1 deletion lib/sequenceserver/sys.rb
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def self.sys(command, options = {})

# Now move the temporary file to the given path.
# TODO: don't we need to explicitly close the temp file here?
FileUtils.mv(temp_files.delete(channel), filename)
FileUtils.cp(temp_files[channel], filename)
end

# Read the remaining temp files into memory. For large outputs,
Expand Down
4 changes: 4 additions & 0 deletions spec/database/invalid/duplicate_ids.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>ApisMellifera_SequenceA
MLPQTPDIIPTTLNQQKCVKARALYDNIAEAPDELAFRKGD
>ApisMellifera_SequenceA
MLPQTPDIIPTTLNQQKCVKARALYDNIAEAPDELAFRKGD
85 changes: 53 additions & 32 deletions spec/makeblastdb_spec.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
require 'spec_helper'
require 'sequenceserver/database'

# Test Database class.
module SequenceServer
describe 'Database' do
describe 'makeblastdb' do
let 'root' do
__dir__
end

let(:disposable_database_dir) { File.join(root, 'tmp', 'databases') }

let 'makeblastdb' do
SequenceServer.makeblastdb
end
Expand Down Expand Up @@ -55,16 +56,6 @@ module SequenceServer
'Sinvicta2-2-3.prot.subset.fasta.phr')
end

let 'data_for_makeblastdb' do
[
File.join(database_dir_unformatted, 'Cardiocondyla_obscurior',
'Cobs1.4.proteins.fa'),
:protein,
'Cobs 1.4 proteins',
true
]
end

let 'makeblastdb_result_pattern' do
File.join(database_dir_unformatted, 'Cardiocondyla_obscurior',
'Cobs1.4.proteins.fa.*')
Expand All @@ -75,64 +66,94 @@ module SequenceServer
end

it 'can tell FASTA file' do
makeblastdb.send(:probably_fasta?, text_file).should be_falsey
makeblastdb.send(:probably_fasta?, binary_file).should be_falsey
makeblastdb.send(:probably_fasta?, fasta_file_prot_seqs).should be_truthy
makeblastdb.send(:probably_fasta?, fasta_file_nucl_seqs).should be_truthy
expect(makeblastdb.send(:probably_fasta?, text_file)).to be_falsey
expect(makeblastdb.send(:probably_fasta?, binary_file)).to be_falsey
expect(makeblastdb.send(:probably_fasta?, fasta_file_prot_seqs)).to be_truthy
expect(makeblastdb.send(:probably_fasta?, fasta_file_nucl_seqs)).to be_truthy
end

it 'can tell type of sequences in FASTA file' do
makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_prot_seqs).should eq :protein
makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_nucl_seqs).should eq :nucleotide
expect(makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_prot_seqs)).to eq :protein
expect(makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_nucl_seqs)).to eq :nucleotide
end

it 'can tell FASTA files that are yet to be made into a BLAST+ database' do
makeblastdb.instance_variable_set(:@database_dir, database_dir_unformatted)
makeblastdb.scan.should be_truthy
expect(makeblastdb.scan).to be_truthy
end

it 'can tell databases that require reformatting' do
# Control: shouldn't report sample v5 databases as requiring reformatting.
makeblastdb.instance_variable_set(:@database_dir, database_dir_v5)
makeblastdb.scan.should be_falsey
expect(makeblastdb.scan).to be_falsey

# Databases created using blastdb_aliastool don't require reformatting either.
makeblastdb.instance_variable_set(:@database_dir, database_dir_blastdb_aliastool)
makeblastdb.scan.should be_falsey
expect(makeblastdb.scan).to be_falsey

# Databases created without -parse_seqids option don't require reformatting either.
# We disable 'sequence download' link instead.
makeblastdb.instance_variable_set(:@database_dir, database_dir_without_parse_seqids)
makeblastdb.scan.should be_falsey
expect(makeblastdb.scan).to be_falsey

# v4 databases require reformatting.
makeblastdb.instance_variable_set(:@database_dir, database_dir_v4)
makeblastdb.scan.should be_truthy
expect(makeblastdb.scan).to be_truthy
end

# it 'can make BLAST+ database from a FASTA file' do
# Database._make_blast_database(*data_for_makeblastdb).should be_truthy
# system "rm #{makeblastdb_result_pattern}"
# end

it 'can make intelligent database name suggestions' do
db_name_pairs = [['Si_gnf.fasta', 'Si gnf'],
['Aech.3.8.cds.fasta', 'Aech 3.8 cds'],
['Cobs1.4.proteins.fasta', 'Cobs 1.4 proteins'],
['S_inv.x.small.2.5.nucl.fa', 'S inv x small 2.5 nucl'],
['Sinvicta2-2-3.prot.fasta', 'Sinvicta 2-2-3 prot']]
db_name_pairs.each do |db|
makeblastdb.send(:make_db_title, db[0]).should eql(db[1])
expect(makeblastdb.send(:make_db_title, db[0])).to eql(db[1])
end
end

it 'can tell NCBI multipart database name' do
sample_name1 = '/home/ben/pd.ben/sequenceserver/db/nr'
sample_name2 = '/home/ben/pd.ben/sequenceserver/db/nr.00'
sample_name3 = '/home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01'
makeblastdb.send(:multipart_database_name?, sample_name1).should be_falsey
makeblastdb.send(:multipart_database_name?, sample_name2).should be_truthy
makeblastdb.send(:multipart_database_name?, sample_name3).should be_truthy
expect(makeblastdb.send(:multipart_database_name?, sample_name1)).to be_falsey
expect(makeblastdb.send(:multipart_database_name?, sample_name2)).to be_truthy
expect(makeblastdb.send(:multipart_database_name?, sample_name3)).to be_truthy
end

describe '#make_blast_database' do
context 'duplicate sequence ids' do
before do
FileUtils.rm_rf(disposable_database_dir)
allow($stdin).to receive(:gets).exactly(3).times.and_return("\n") # Accept default option prompted by the CLI
allow_any_instance_of(Object).to receive(:exit!).and_return(nil) # Prevents the CLI from killing Rspec process
FileUtils.mkdir_p(disposable_database_dir)
FileUtils.cp_r(File.join(database_dir, 'invalid', 'duplicate_ids.fasta'), disposable_database_dir)
end

after do
FileUtils.rm_rf(disposable_database_dir)
end

let(:duplicated_id_database) { File.join(disposable_database_dir, 'duplicate_ids.fasta') }

it 'it records errors in a file' do
makeblastdb = SequenceServer::MAKEBLASTDB.new(disposable_database_dir)
makeblastdb.scan
makeblastdb.format

expect(File.read("#{duplicated_id_database}.makeblastdbstderr")).to match(/Duplicate seq_ids are found/)
end

it 'it prints errors to stdout' do
allow($stdout).to receive(:puts).and_call_original
makeblastdb = SequenceServer::MAKEBLASTDB.new(disposable_database_dir)
makeblastdb.scan
makeblastdb.format

expect($stdout).to have_received(:puts).with(/Duplicate seq_ids are found/)
end
end
end
end
end
Loading