Translation

Background

This vignette demonstrates baitfindR functions for translating open reading frames (ORFs) from transcriptomes.

These functions require blast (2.6.0) and transdecoder (3.0.1) to be installed and on the user’s PATH. Other versions have not been tested and may not work properly. The best way to ensure the proper dependencies are available is to use the baitfindR docker image.

Setup

Most of these functions are wrappers for external programs that modify and write-out files on disk, unlike R functions that use objects in the R environment. So we will make a temporary directory for writing files, and check on what’s in there with list.files() after each step.

# Load packages.
library(baitfindR)
library(ape)
library(fs)

# Turn on echo flag to print messages from calls to external programs.
pkgconfig::set_config(`baitfindR::echo` = TRUE)

# Make temporary working directory to write files.
temp_dir <- fs::dir_create(fs::path(tempdir(), "baitfindR_example"))

Load data

Download the Arabidopsis thaliana proteome (14.7 mb). This will be used to detect candidate ORFs.

download.file(
  url = "https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_pep_20110103_representative_gene_model_updated",
  destfile = fs::path(temp_dir, "arabidopsis_prot.fasta")
)

# Check what we've written out.
list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta"

Write out example fasta transcriptome: Asplenium nidus transcriptome from the 1KP project downsized to 5% of original size.

example_transcriptomes <- baitfindR::example_transcriptomes
PSKY <- example_transcriptomes$PSKY
ape::write.FASTA(PSKY, fs::path(temp_dir, "PSKY"))

list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "PSKY"

Find ORFs

Construct blast database from Arabidopsis proteome.

blast_db <- baitfindR::build_blast_db(
  in_seqs = fs::path(temp_dir, "arabidopsis_prot.fasta"),
  out_name = "arabidopsis",
  parse_seqids = TRUE,
  db_type = "prot",
  wd = temp_dir
)
#> 
#> 
#> Building a new DB, current time: 05/15/2019 16:41:07
#> New DB name:   /tmp/RtmpXRNrLr/baitfindR_example/arabidopsis
#> New DB title:  /tmp/RtmpXRNrLr/baitfindR_example/arabidopsis_prot.fasta
#> Sequence type: Protein
#> Keep MBits: T
#> Maximum file size: 1000000000B
#> Adding sequences from FASTA; added 27416 sequences in 1.23689 seconds.

list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "arabidopsis.phr"       
#> [3] "arabidopsis.pin"        "arabidopsis.pog"       
#> [5] "arabidopsis.psd"        "arabidopsis.psi"       
#> [7] "arabidopsis.psq"        "PSKY"

Find long ORFs in transcriptome.

long_orfs <- baitfindR::transdecoder_long_orfs(
  transcriptome_file = fs::path(temp_dir, "PSKY"),
  wd = temp_dir,
  other_args = "-S"
)
#> CMD: /usr/lib/transdecoder/util/compute_base_probs.pl /tmp/RtmpXRNrLr/baitfindR_example/PSKY 1 > PSKY.transdecoder_dir/base_freqs.dat
#> 
#> 
#> -first extracting base frequencies, we'll need them later.
#> CMD: touch PSKY.transdecoder_dir/base_freqs.dat.ok
#> 
#> 
#> - extracting ORFs from transcripts.
#> -total transcripts to examine: 2938
#> 
[100/2938] = 3.40% done    
[200/2938] = 6.81% done    
[300/2938] = 10.21% done    
[400/2938] = 13.61% done    
[500/2938] = 17.02% done    
[600/2938] = 20.42% done    
[700/2938] = 23.83% done    
[800/2938] = 27.23% done    
[900/2938] = 30.63% done    
[1000/2938] = 34.04% done    
[1100/2938] = 37.44% done    
[1200/2938] = 40.84% done    
[1300/2938] = 44.25% done    
[1400/2938] = 47.65% done    
[1500/2938] = 51.06% done    
[1600/2938] = 54.46% done    
[1700/2938] = 57.86% done    
[1800/2938] = 61.27% done    
[1900/2938] = 64.67% done    
[2000/2938] = 68.07% done    
[2100/2938] = 71.48% done    
[2200/2938] = 74.88% done    
[2300/2938] = 78.28% done    
[2400/2938] = 81.69% done    
[2500/2938] = 85.09% done    
[2600/2938] = 88.50% done    
[2700/2938] = 91.90% done    
[2800/2938] = 95.30% done    
[2900/2938] = 98.71% done    
#> 
#> #################################
#> ### Done preparing long ORFs.  ###
#> ##################################
#> 
#>  Use file: PSKY.transdecoder_dir/longest_orfs.pep  for Pfam and/or BlastP searches to enable homology-based coding region identification.
#> 
#>  Then, run TransDecoder.Predict for your final coding region predictions.

list.files(temp_dir)
#> [1] "arabidopsis_prot.fasta" "arabidopsis.phr"       
#> [3] "arabidopsis.pin"        "arabidopsis.pog"       
#> [5] "arabidopsis.psd"        "arabidopsis.psi"       
#> [7] "arabidopsis.psq"        "PSKY"                  
#> [9] "PSKY.transdecoder_dir"

Blast candidate long ORFs against the protein database (takes 2-3 minutes).

blast_p_results <- baitfindR::blast_p(
  query = fs::path(temp_dir, "PSKY.transdecoder_dir/longest_orfs.pep"),
  database = "arabidopsis",
  wd = fs::path(temp_dir),
  other_args = c("-max_target_seqs", 1, "-evalue", 10, "-num_threads", 2),
  out_file = fs::path(temp_dir, "PSKY.blastp.outfmt6")
)

list.files(temp_dir)
#>  [1] "arabidopsis_prot.fasta" "arabidopsis.phr"       
#>  [3] "arabidopsis.pin"        "arabidopsis.pog"       
#>  [5] "arabidopsis.psd"        "arabidopsis.psi"       
#>  [7] "arabidopsis.psq"        "PSKY"                  
#>  [9] "PSKY.blastp.outfmt6"    "PSKY.transdecoder_dir"

Output the final ORFs, preferentially retaining those with blast hits.

predict_results <- baitfindR::transdecoder_predict(
  transcriptome_file = fs::path(temp_dir, "PSKY"),
  blast_result = fs::path(temp_dir, "PSKY.blastp.outfmt6"),
  wd = temp_dir,
  echo = FALSE # transdecoder predict produces really long output
)

list.files(temp_dir)
#>  [1] "arabidopsis_prot.fasta" "arabidopsis.phr"       
#>  [3] "arabidopsis.pin"        "arabidopsis.pog"       
#>  [5] "arabidopsis.psd"        "arabidopsis.psi"       
#>  [7] "arabidopsis.psq"        "PSKY"                  
#>  [9] "PSKY.blastp.outfmt6"    "PSKY.transdecoder_dir" 
#> [11] "PSKY.transdecoder.bed"  "PSKY.transdecoder.cds" 
#> [13] "PSKY.transdecoder.gff3" "PSKY.transdecoder.pep"

Cleanup

fs::file_delete(temp_dir)

Joel Nitta

2019-05-15

Background

Setup

Load data

Find ORFs

Cleanup

Contents