Extract top blast hits from multiple blast output files

BLAST results files must be in tabular format (e.g., outfmt 6). For each BLAST results file, the single top hit (sorted by ascending evalue, then descending bitscore) will be extracted from the BLAST database and written to `out_dir`.

extract_blast_hits(blast_results_dir, blast_results_pattern,
  blast_cols = c("qseqid", "sseqid", "pident", "length", "mismatch",
  "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"),
  database_path, out_dir, out_ext = "bestmatch.fasta", ...)

Arguments

blast_results_dir	Path to folder containing BLAST results files.
blast_results_pattern	Optional; pattern used for matching with grep. Only files with names matching the pattern will be used to extract top blast hits.
blast_cols	Character vector; column names of BLAST results. See https://www.ncbi.nlm.nih.gov/books/NBK279684/ (Table C1) for details. Must include at least 'sseqid', 'evalue', and 'bitscore'. Defaults to standard columns for output format 6.
database_path	Path to the BLAST database, including the database name.
out_dir	Path to folder to write results.
out_ext	File extension used when writing out top BLAST hit.
...	Additional other arguments. Not used by this function, but meant to be used by `drake_plan` for tracking during workflows.

Value

TRUE when runs successfully; externally, the top blast hit for each fasta result will be written to `out_dir`.

Examples

library(ape)

# Make temp dir for storing files
temp_dir <- fs::dir_create(fs::path(tempdir(), "baitfindR_example"))

# Write out ape::woodmouse dataset as DNA
data(woodmouse)
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse.fasta"))
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse2.fasta"))

# Make blast database
build_blast_db(
  fs::path(temp_dir, "woodmouse.fasta"),
  db_type = "nucl",
  out_name = "wood",
  parse_seqids = TRUE,
  wd = temp_dir)
#> 
#> 
#> Building a new DB, current time: 05/15/2019 16:40:43
#> New DB name:   /tmp/RtmpeNC9nF/baitfindR_example/wood
#> New DB title:  /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta
#> Sequence type: Nucleotide
#> Keep MBits: T
#> Maximum file size: 1000000000B
#> Adding sequences from FASTA; added 15 sequences in 0.0116849 seconds.
#> $status
#> [1] 0
#> 
#> $stdout
#> [1] "\n\nBuilding a new DB, current time: 05/15/2019 16:40:43\nNew DB name:   /tmp/RtmpeNC9nF/baitfindR_example/wood\nNew DB title:  /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta\nSequence type: Nucleotide\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 15 sequences in 0.0116849 seconds.\n"
#> 
#> $stderr
#> [1] ""
#> 
#> $timeout
#> [1] FALSE
#> 

# Blast the original sequences against the database
blast_n_list(
  fasta_folder = temp_dir,
  fasta_pattern = "fasta",
  database_path = fs::path(temp_dir, "wood")
)
#> [1] "c90963e4b507281024a89cbb54d8074f"

# Extract the top BLAST hit for each fasta file.
extract_blast_hits(
  blast_results_dir = temp_dir,
  blast_results_pattern = "\\.tsv$",
  database_path = fs::path(temp_dir, "wood"),
  out_dir = temp_dir
)
#> Parsed with column specification:
#> cols(
#>   qseqid = col_character(),
#>   sseqid = col_character(),
#>   pident = col_double(),
#>   length = col_double(),
#>   mismatch = col_double(),
#>   gapopen = col_double(),
#>   qstart = col_double(),
#>   qend = col_double(),
#>   sstart = col_double(),
#>   send = col_double(),
#>   evalue = col_double(),
#>   bitscore = col_double()
#> )
#> Parsed with column specification:
#> cols(
#>   qseqid = col_character(),
#>   sseqid = col_character(),
#>   pident = col_double(),
#>   length = col_double(),
#>   mismatch = col_double(),
#>   gapopen = col_double(),
#>   qstart = col_double(),
#>   qend = col_double(),
#>   sstart = col_double(),
#>   send = col_double(),
#>   evalue = col_double(),
#>   bitscore = col_double()
#> )
#> [1] TRUE

list.files(temp_dir)
#>  [1] "wood.nhr"                       "wood.nin"                      
#>  [3] "wood.nog"                       "wood.nsd"                      
#>  [5] "wood.nsi"                       "wood.nsq"                      
#>  [7] "woodmouse.fasta"                "woodmouse.tsv"                 
#>  [9] "woodmouse.tsv.bestmatch.fasta"  "woodmouse2.fasta"              
#> [11] "woodmouse2.tsv"                 "woodmouse2.tsv.bestmatch.fasta"
ape::read.FASTA(fs::path(temp_dir, "woodmouse.tsv.bestmatch.fasta"))
#> 1 DNA sequence in binary format stored in a list.
#> 
#> Sequence length: 965 
#> 
#> Label:
#> No306 
#> 
#> Base composition:
#>     a     c     g     t 
#> 0.309 0.258 0.125 0.308 
#> (Total: 965 bases)

# Cleanup.
fs::file_delete(temp_dir)

Arguments

Value

Examples

Contents