R/blast.R
realign_with_best_hits.Rd

Top hits and original fasta files are matched based on the part of the filename before the first period (i.e., the filename without any extension).
realign_with_best_hits(best_hits_dir, best_hits_pattern = "bestmatch", fasta_dir, fasta_pattern = "\\.fa$", ...)
| Argument | Description |
|---|---|
| best_hits_dir | Path to directory containing top blast hits. |
| best_hits_pattern | Pattern used for matching with grep. Only files with names matching the pattern will be included as the top blast hit. |
| fasta_dir | Path to directory containing fasta files for realignment. |
| fasta_pattern | Pattern used for matching with grep. Only files with names matching the pattern will be included for realignment. |
| ... | Additional arguments. Not used by this function, but meant to be used by `drake::drake_plan()` for tracking during workflows. |
List of lists, each of which is of class `DNAbin`.
library(ape)
# Make temp dir for storing files
temp_dir <- fs::dir_create(fs::path(tempdir(), "baitfindR_example"))
# Write out ape::woodmouse dataset as DNA
data(woodmouse)
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse.fasta"))
ape::write.FASTA(woodmouse, fs::path(temp_dir, "woodmouse2.fasta"))
# Make blast database
build_blast_db(
  fs::path(temp_dir, "woodmouse.fasta"),
  db_type = "nucl",
  out_name = "wood",
  parse_seqids = TRUE,
  wd = temp_dir)
#>
#>
#> Building a new DB, current time: 05/15/2019 16:40:51
#> New DB name: /tmp/RtmpeNC9nF/baitfindR_example/wood
#> New DB title: /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta
#> Sequence type: Nucleotide
#> Keep MBits: T
#> Maximum file size: 1000000000B
#> Adding sequences from FASTA; added 15 sequences in 0.018368 seconds.
#> $status
#> [1] 0
#>
#> $stdout
#> [1] "\n\nBuilding a new DB, current time: 05/15/2019 16:40:51\nNew DB name: /tmp/RtmpeNC9nF/baitfindR_example/wood\nNew DB title: /tmp/RtmpeNC9nF/baitfindR_example/woodmouse.fasta\nSequence type: Nucleotide\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 15 sequences in 0.018368 seconds.\n"
#>
#> $stderr
#> [1] ""
#>
#> $timeout
#> [1] FALSE
#>
# Blast the original sequences against the database
blast_n_list(
  fasta_folder = temp_dir,
  fasta_pattern = "fasta",
  database_path = fs::path(temp_dir, "wood")
)
#> [1] "c90963e4b507281024a89cbb54d8074f"
# Extract the top BLAST hit for each fasta file.
extract_blast_hits(
  blast_results_dir = temp_dir,
  blast_results_pattern = "\\.tsv$",
  database_path = fs::path(temp_dir, "wood"),
  out_dir = temp_dir,
  out_ext = "bestmatch"
)
#>
#>
#>
#>
#>
#> col_character(),
#> col_character(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double(),
#> col_double()
#>
#> [1] TRUE
realign_with_best_hits(
  best_hits_dir = temp_dir,
  best_hits_pattern = "bestmatch",
  fasta_dir = temp_dir,
  fasta_pattern = "fasta"
)
#> $woodmouse
#> 16 DNA sequences in binary format stored in a matrix.
#>
#> All sequences of same length: 965
#>
#> Labels:
#> No305
#> No304
#> No306
#> No0906S
#> No0908S
#> No0909S
#> ...
#>
#> Base composition:
#>     a     c     g     t
#> 0.307 0.261 0.126 0.306
#> (Total: 15.44 kb)
#>
#> $woodmouse2
#> 16 DNA sequences in binary format stored in a matrix.
#>
#> All sequences of same length: 965
#>
#> Labels:
#> No305
#> No304
#> No306
#> No0906S
#> No0908S
#> No0909S
#> ...
#>
#> Base composition:
#>     a     c     g     t
#> 0.307 0.261 0.126 0.306
#> (Total: 15.44 kb)
#>