Top hits and original fasta files are matched by the part of the filename before the first period (i.e., the filename without any extension).

realign_with_best_hits(best_hits_dir, best_hits_pattern = "bestmatch",
  fasta_dir, fasta_pattern = "\\.fa$", ...)

Arguments

best_hits_dir

Path to directory containing top blast hits.

best_hits_pattern

Pattern used for matching with grep. Only files with names matching the pattern will be included as top blast hits.

fasta_dir

Path to directory containing fasta files for realignment.

fasta_pattern

Pattern used for matching with grep. Only files with names matching the pattern will be included for realignment.

...

Additional arguments. Not used by this function, but meant to be used by drake_plan for tracking during workflows.

Value

List of lists, each of which is of class `DNAbin`.

Examples

library(ape)
# Make temp dir for storing files
temp_dir <- fs::dir_create(fs::path(tempdir(), "baitfindR_example"))
# Write out ape::woodmouse dataset as DNA
data(woodmouse)
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse.fasta"))
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse2.fasta"))
# Make blast database
build_blast_db(
  fs::path(temp_dir, "woodmouse.fasta"),
  db_type = "nucl",
  out_name = "wood",
  parse_seqids = TRUE,
  wd = temp_dir)
#> #> #> Building a new DB, current time: 05/15/2019 16:40:51 #> New DB name: /tmp/RtmpeNC9nF/baitfindR_example/wood #> New DB title: /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta #> Sequence type: Nucleotide #> Keep MBits: T #> Maximum file size: 1000000000B #> Adding sequences from FASTA; added 15 sequences in 0.018368 seconds.
#> $status #> [1] 0 #> #> $stdout #> [1] "\n\nBuilding a new DB, current time: 05/15/2019 16:40:51\nNew DB name: /tmp/RtmpeNC9nF/baitfindR_example/wood\nNew DB title: /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta\nSequence type: Nucleotide\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 15 sequences in 0.018368 seconds.\n" #> #> $stderr #> [1] "" #> #> $timeout #> [1] FALSE #>
# Blast the original sequences against the database
blast_n_list(
  fasta_folder = temp_dir,
  fasta_pattern = "fasta",
  database_path = fs::path(temp_dir, "wood")
)
#> [1] "c90963e4b507281024a89cbb54d8074f"
# Extract the top BLAST hit for each fasta file.
extract_blast_hits(
  blast_results_dir = temp_dir,
  blast_results_pattern = "\\.tsv$",
  database_path = fs::path(temp_dir, "wood"),
  out_dir = temp_dir,
  out_ext = "bestmatch"
)
#> Parsed with column specification: #> cols( #> qseqid = col_character(), #> sseqid = col_character(), #> pident = col_double(), #> length = col_double(), #> mismatch = col_double(), #> gapopen = col_double(), #> qstart = col_double(), #> qend = col_double(), #> sstart = col_double(), #> send = col_double(), #> evalue = col_double(), #> bitscore = col_double() #> )
#> Parsed with column specification: #> cols( #> qseqid = col_character(), #> sseqid = col_character(), #> pident = col_double(), #> length = col_double(), #> mismatch = col_double(), #> gapopen = col_double(), #> qstart = col_double(), #> qend = col_double(), #> sstart = col_double(), #> send = col_double(), #> evalue = col_double(), #> bitscore = col_double() #> )
#> [1] TRUE
realign_with_best_hits( best_hits_dir = temp_dir, best_hits_pattern = "bestmatch", fasta_dir = temp_dir, fasta_pattern = "fasta" )
#> $woodmouse #> 16 DNA sequences in binary format stored in a matrix. #> #> All sequences of same length: 965 #> #> Labels: #> No305 #> No304 #> No306 #> No0906S #> No0908S #> No0909S #> ... #> #> Base composition: #> a c g t #> 0.307 0.261 0.126 0.306 #> (Total: 15.44 kb) #> #> $woodmouse2 #> 16 DNA sequences in binary format stored in a matrix. #> #> All sequences of same length: 965 #> #> Labels: #> No305 #> No304 #> No306 #> No0906S #> No0908S #> No0909S #> ... #> #> Base composition: #> a c g t #> 0.307 0.261 0.126 0.306 #> (Total: 15.44 kb) #>
# Cleanup.
fs::file_delete(temp_dir)