This vignette demonstrates baitfindR functions for translating open reading frames (ORFs) from transcriptomes.
These functions require blast (2.6.0) and transdecoder (3.0.1) to be installed and on the user’s PATH. Other versions have not been tested and may not work properly. The best way to ensure the proper dependencies are available is to use the baitfindR docker image.
Most of these functions are wrappers for external programs that modify and write-out files on disk, unlike R functions that use objects in the R environment. So we will make a temporary directory for writing files, and check on what’s in there with list.files() after each step.
Download the Arabidopsis thaliana proteome (14.7 mb). This will be used to detect candidate ORFs.
download.file(
url = "https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_pep_20110103_representative_gene_model_updated",
destfile = fs::path(temp_dir, "arabidopsis_prot.fasta")
)
# Check what we've written out.
list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta"Write out example fasta transcriptome: Asplenium nidus transcriptome from the 1KP project downsized to 5% of original size.
Construct blast database from Arabidopsis proteome.
blast_db <- baitfindR::build_blast_db(
in_seqs = fs::path(temp_dir, "arabidopsis_prot.fasta"),
out_name = "arabidopsis",
parse_seqids = TRUE,
db_type = "prot",
wd = temp_dir
)
#>
#>
#> Building a new DB, current time: 05/15/2019 16:41:07
#> New DB name: /tmp/RtmpXRNrLr/baitfindR_example/arabidopsis
#> New DB title: /tmp/RtmpXRNrLr/baitfindR_example/arabidopsis_prot.fasta
#> Sequence type: Protein
#> Keep MBits: T
#> Maximum file size: 1000000000B
#> Adding sequences from FASTA; added 27416 sequences in 1.23689 seconds.
list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "arabidopsis.phr"
#> [3] "arabidopsis.pin" "arabidopsis.pog"
#> [5] "arabidopsis.psd" "arabidopsis.psi"
#> [7] "arabidopsis.psq" "PSKY"Find long ORFs in transcriptome.
long_orfs <- baitfindR::transdecoder_long_orfs(
transcriptome_file = fs::path(temp_dir, "PSKY"),
wd = temp_dir,
other_args = "-S"
)
#> CMD: /usr/lib/transdecoder/util/compute_base_probs.pl /tmp/RtmpXRNrLr/baitfindR_example/PSKY 1 > PSKY.transdecoder_dir/base_freqs.dat
#>
#>
#> -first extracting base frequencies, we'll need them later.
#> CMD: touch PSKY.transdecoder_dir/base_freqs.dat.ok
#>
#>
#> - extracting ORFs from transcripts.
#> -total transcripts to examine: 2938
#>
[100/2938] = 3.40% done
[200/2938] = 6.81% done
[300/2938] = 10.21% done
[400/2938] = 13.61% done
[500/2938] = 17.02% done
[600/2938] = 20.42% done
[700/2938] = 23.83% done
[800/2938] = 27.23% done
[900/2938] = 30.63% done
[1000/2938] = 34.04% done
[1100/2938] = 37.44% done
[1200/2938] = 40.84% done
[1300/2938] = 44.25% done
[1400/2938] = 47.65% done
[1500/2938] = 51.06% done
[1600/2938] = 54.46% done
[1700/2938] = 57.86% done
[1800/2938] = 61.27% done
[1900/2938] = 64.67% done
[2000/2938] = 68.07% done
[2100/2938] = 71.48% done
[2200/2938] = 74.88% done
[2300/2938] = 78.28% done
[2400/2938] = 81.69% done
[2500/2938] = 85.09% done
[2600/2938] = 88.50% done
[2700/2938] = 91.90% done
[2800/2938] = 95.30% done
[2900/2938] = 98.71% done
#>
#> #################################
#> ### Done preparing long ORFs. ###
#> ##################################
#>
#> Use file: PSKY.transdecoder_dir/longest_orfs.pep for Pfam and/or BlastP searches to enable homology-based coding region identification.
#>
#> Then, run TransDecoder.Predict for your final coding region predictions.
list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "arabidopsis.phr"
#> [3] "arabidopsis.pin" "arabidopsis.pog"
#> [5] "arabidopsis.psd" "arabidopsis.psi"
#> [7] "arabidopsis.psq" "PSKY"
#> [9] "PSKY.transdecoder_dir"Blast candidate long ORFs against the protein database (takes 2-3 minutes).
blast_p_results <- baitfindR::blast_p(
query = fs::path(temp_dir, "PSKY.transdecoder_dir/longest_orfs.pep"),
database = "arabidopsis",
wd = fs::path(temp_dir),
other_args = c("-max_target_seqs", 1, "-evalue", 10, "-num_threads", 2),
out_file = fs::path(temp_dir, "PSKY.blastp.outfmt6")
)
list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "arabidopsis.phr"
#> [3] "arabidopsis.pin" "arabidopsis.pog"
#> [5] "arabidopsis.psd" "arabidopsis.psi"
#> [7] "arabidopsis.psq" "PSKY"
#> [9] "PSKY.blastp.outfmt6" "PSKY.transdecoder_dir"Output the final ORFs, preferentially retaining those with blast hits.
predict_results <- baitfindR::transdecoder_predict(
transcriptome_file = fs::path(temp_dir, "PSKY"),
blast_result = fs::path(temp_dir, "PSKY.blastp.outfmt6"),
wd = temp_dir,
echo = FALSE # transdecoder predict produces really long output
)
list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "arabidopsis.phr"
#> [3] "arabidopsis.pin" "arabidopsis.pog"
#> [5] "arabidopsis.psd" "arabidopsis.psi"
#> [7] "arabidopsis.psq" "PSKY"
#> [9] "PSKY.blastp.outfmt6" "PSKY.transdecoder_dir"
#> [11] "PSKY.transdecoder.bed" "PSKY.transdecoder.cds"
#> [13] "PSKY.transdecoder.gff3" "PSKY.transdecoder.pep"