Run blastn on all fasta files in a folder.

Output is written to the same folder containing the input files.

blast_n_list(fasta_folder, fasta_pattern, database_path, out_ext = "tsv",
  outfmt = "6", other_args = NULL, overwrite = FALSE, echo = FALSE,
  get_hash = TRUE, ...)

Arguments

fasta_folder	Path to the folder containing fasta files to BLAST.
fasta_pattern	Optional; pattern used for matching with grep. Only files with names matching the pattern will be included in the BLAST search.
database_path	Path to the BLAST database, including the database name.
out_ext	File extension used for BLAST results files. The result of each BLAST search will be a file with the same name as the input fasta file, but with its extension replaced by this one (e.g., `woodmouse.fasta` produces `woodmouse.tsv`).
outfmt	String; format to use for BLAST output. See https://www.ncbi.nlm.nih.gov/books/NBK279684/ (Table C1) for details.
other_args	Character vector; other arguments to pass on to `blastn`. For a list of options, run `blastn -help`.
overwrite	Logical; should old output be erased before running this function? "Old output" is identified as any file whose name matches `out_ext`.
echo	Logical; should standard error and output be printed?
get_hash	Logical; if TRUE, the MD5 hash of the output will be returned.
...	Additional arguments. Not used by this function, but meant to be used by `drake_plan` for tracking during workflows.

Value

`NULL`, or a character vector if `get_hash` is TRUE. Externally, a text file with the results of the blastn search is written for each input fasta file, named after that input file with the extension `out_ext`.

References

https://www.ncbi.nlm.nih.gov/books/NBK279690/

Examples

library(ape)

# Make temp dir for storing files
temp_dir <- fs::dir_create(fs::path(tempdir(), "baitfindR_example"))

# Write out ape::woodmouse dataset as DNA
data(woodmouse)
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse.fasta"))
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse2.fasta"))

# Make blast database
build_blast_db(
  fs::path(temp_dir, "woodmouse.fasta"),
  db_type = "nucl",
  out_name = "wood",
  parse_seqids = TRUE,
  wd = temp_dir)
#> 
#> 
#> Building a new DB, current time: 05/15/2019 16:40:37
#> New DB name:   /tmp/RtmpeNC9nF/baitfindR_example/wood
#> New DB title:  /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta
#> Sequence type: Nucleotide
#> Keep MBits: T
#> Maximum file size: 1000000000B
#> Adding sequences from FASTA; added 15 sequences in 0.0124049 seconds.
#> $status
#> [1] 0
#> 
#> $stdout
#> [1] "\n\nBuilding a new DB, current time: 05/15/2019 16:40:37\nNew DB name:   /tmp/RtmpeNC9nF/baitfindR_example/wood\nNew DB title:  /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta\nSequence type: Nucleotide\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 15 sequences in 0.0124049 seconds.\n"
#> 
#> $stderr
#> [1] ""
#> 
#> $timeout
#> [1] FALSE
#> 

# Blast the original sequences against the database
blast_n_list(
  fasta_folder = temp_dir,
  fasta_pattern = "fasta",
  database_path = fs::path(temp_dir, "wood")
)
#> [1] "c90963e4b507281024a89cbb54d8074f"

# Take a look at the results.
readr::read_tsv(
  fs::path(temp_dir, "woodmouse.tsv"),
  col_names = FALSE
  )
#> Parsed with column specification:
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_double(),
#>   X4 = col_double(),
#>   X5 = col_double(),
#>   X6 = col_double(),
#>   X7 = col_double(),
#>   X8 = col_double(),
#>   X9 = col_double(),
#>   X10 = col_double(),
#>   X11 = col_double(),
#>   X12 = col_double()
#> )
#> # A tibble: 225 x 12
#>    X1    X2         X3    X4    X5    X6    X7    X8    X9   X10   X11   X12
#>    <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 No305 No305   100     964     0     0     2   965     2   965     0  1773
#>  2 No305 No1103S  98.5   960    14     0     2   961     2   961     0  1700
#>  3 No305 No306    98.2   964    17     0     2   965     2   965     0  1694
#>  4 No305 No0912S  98.3   960    16     0     2   961     2   961     0  1688
#>  5 No305 No1206S  98.1   960    18     0     2   961     2   961     0  1677
#>  6 No305 No1202S  98.1   960    18     0     2   961     2   961     0  1677
#>  7 No305 No1007S  98.1   960    18     0     2   961     2   961     0  1677
#>  8 No305 No0909S  98.1   960    18     0     2   961     2   961     0  1677
#>  9 No305 No0908S  98.1   960    18     0     2   961     2   961     0  1677
#> 10 No305 No304    98.0   961    19     0     2   962     2   962     0  1677
#> # … with 215 more rows

readr::read_tsv(
  fs::path(temp_dir, "woodmouse2.tsv"),
  col_names = FALSE
  )
#> Parsed with column specification:
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_double(),
#>   X4 = col_double(),
#>   X5 = col_double(),
#>   X6 = col_double(),
#>   X7 = col_double(),
#>   X8 = col_double(),
#>   X9 = col_double(),
#>   X10 = col_double(),
#>   X11 = col_double(),
#>   X12 = col_double()
#> )
#> # A tibble: 225 x 12
#>    X1    X2         X3    X4    X5    X6    X7    X8    X9   X10   X11   X12
#>    <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 No305 No305   100     964     0     0     2   965     2   965     0  1773
#>  2 No305 No1103S  98.5   960    14     0     2   961     2   961     0  1700
#>  3 No305 No306    98.2   964    17     0     2   965     2   965     0  1694
#>  4 No305 No0912S  98.3   960    16     0     2   961     2   961     0  1688
#>  5 No305 No1206S  98.1   960    18     0     2   961     2   961     0  1677
#>  6 No305 No1202S  98.1   960    18     0     2   961     2   961     0  1677
#>  7 No305 No1007S  98.1   960    18     0     2   961     2   961     0  1677
#>  8 No305 No0909S  98.1   960    18     0     2   961     2   961     0  1677
#>  9 No305 No0908S  98.1   960    18     0     2   961     2   961     0  1677
#> 10 No305 No304    98.0   961    19     0     2   962     2   962     0  1677
#> # … with 215 more rows

# Cleanup.
fs::file_delete(temp_dir)

Arguments

Value

References

Examples

Contents

Author