Calculate transcript abundances

Last updated: 2017-03-28

Code version: 0e2e80a

We can use several different methods to calculate transcript abundances. Here I will use Stringtie and HTSeq to calculate TPM values.

Running stringtie

First, we make sure we have the appropriate annotation file:

cat data/annotations/exons_nuclear_3D7_v24.gff | \
  gffread -F -E -o- > output/transcript_abundance/stringtie_annotation.gff

Now let’s run Stringtie:

for strain in 3d7 hb3 it; do
  for i in $(seq 1 7); do
    for s in rf fr; do
      stringtie -G output/transcript_abundance/annotation.gff \
        -e \
        --"${s}" \
        -p 8 \
        -A output/transcript_abundance/stringtie/"${strain}_tp${i}_${s}.abund" \
        "data/bam/mapped_to_3d7/${strain}.3d7_v3_chr.tp${i}.bam" > \
        "output/transcript_abundance/stringtie/${strain}_tp${i}_${s}.gtf"
    done
  done
done

Running HTSeq

We’ll also quantify read counts using HTSeq. First generate another annotation since HTSeq likes using GTF files:

cat data/annotations/exons_nuclear_3D7_v24.gff | \
  gffread -F -E -T -o- > output/transcript_abundance/htseq_annotation.gtf

And then run HTSeq using the following command:

for strain in 3d7 hb3 it; do
  for i in $(seq 1 7); do
    for s in yes reverse; do
      htseq-count --format=bam --order=pos --stranded=${s} --minaqual=30 --idattr=transcript_id --mode=union \
        "data/bam/mapped_to_3d7/${strain}.3d7_v3_chr.tp${i}.bam" \
        "output/transcript_abundance/htseq_annotation.gtf" > \
        "output/transcript_abundance/htseq/${strain}_tp${i}_${s}.txt"
    done
  done
done

Clean and join the results

Now we’ll just import the results and reformat them for future use.

Stringtie

Here we’ll import the Stringtie data:

# For each strain
for (strain in c("3d7","hb3","it")) {
  # create a tibble with the right columns
  assign(x     = paste0("x",strain,"_tpm"),
         value = tibble::tibble(transcript_id=character(),
                      cov=double(),
                      FPKM=double(),
                      TPM=double(),
                      tp=integer(),
                      strain=character()))
  # for each timepoint
  for (tp in seq(1,7)) {
    # generate file path
    f <- paste0("../output/transcript_abundance/stringtie/",strain,"_tp",tp,"_rf.gtf")
    # import the file and convert to the right format
    df <- tibble::as_tibble(import.gff(f)) %>% 
            dplyr::filter(type=="transcript") %>% 
            dplyr::select(transcript_id,cov,FPKM,TPM) %>% 
            dplyr::mutate(cov=as.double(cov),FPKM=as.double(FPKM),TPM=as.double(TPM),tp=tp,strain=strain) %>%
            dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"rna_","")) %>%
            dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"-1$",""))
    # bind this data to the tibble we created earlier for each strain
    assign(x     = paste0("x",strain,"_tpm"),
           value = dplyr::bind_rows(eval(parse(text=paste0("x",strain,"_tpm"))),df))
  }
}
# clean up
rm(df)

And let’s save these to output:

# Sense and antisense counts in long format
saveRDS(object=x3d7_tpm,file="../output/transcript_abundance/stringtie_3d7_abund.rds")
saveRDS(object=xhb3_tpm,file="../output/transcript_abundance/stringtie_hb3_abund.rds")
saveRDS(object=xit_tpm,file="../output/transcript_abundance/stringtie_it_abund.rds")

HTSeq

In order to properly calculate TPMs, we need to calculate the effective transcript lengths. To do this we need the mean or median fragment length distribution for each sample. We can generate this data using picard:

for file in $(find data/bam/mapped_to_3d7/ -name *.bam); do 
  picard CollectInsertSizeMetrics INPUT=$file HISTOGRAM_FILE=${file%.*}_insert_size.hist OUTPUT=${file%.*}_insert_size.out; 
done

Now we can read each file in and record the mean fragment length:

# create a tibble with the right columns
inserts <- tibble(sample=character(),mean_insert_size=double(),median_insert_size=double(),tp=integer())
# for each strain
for (strain in c("3d7","hb3","it")) {
  for (tp in seq(1,7)) {
    # generate the file path
    f <- paste0("../data/bam/mapped_to_3d7/",strain,".3d7_v3_chr.tp",tp,"_insert_size.out")
    # extract mean and median insert sizes
    mean_is <- read_tsv(f,skip=6,n_max = 1)$MEAN_INSERT_SIZE
    median_is <- read_tsv(f,skip=6,n_max = 1)$MEDIAN_INSERT_SIZE
    # bind this row to the tibble we created earlier
    inserts <- dplyr::bind_rows(inserts,tibble(sample=paste0("x",strain,"_tp",tp),mean_insert_size=mean_is,median_insert_size=median_is,tp=tp))
  }
}

It turns out, however, that using effective transcript lengths in Plasmodium is problematic…Some transcripts end up with a negative length if we use the formula provided for us here. So we’ll simply use full transcript lengths instead of “correcting” for that bias.

Import the HTSeq data:

# import our calculate transcript lengths
transcript_lengths <- read_tsv("../output/transcript_abundance/transcript_lengths.tsv")

# for each strain
for (strain in c("3d7","hb3","it")) {
  # create a tibble with the right colums
  assign(x     = paste0("x",strain,"_counts"),
         value = tibble(transcript_id=character(),
                        sense_counts=double(),
                        antisense_counts=double()))
  # for each time point
  for (tp in seq(1,7)) {
    # generate the file names
    s <- paste0("../output/transcript_abundance/htseq/",strain,"_tp",tp,"_reverse.txt")
    a <- paste0("../output/transcript_abundance/htseq/",strain,"_tp",tp,"_yes.txt")
    # and import sense and antisense counts and process their raw values
    st <- read_tsv(s,comment="__",col_names=c("transcript_id","sense_counts")) %>%
      dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"rna_","")) %>%
      dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"-1$",""))
    at <- read_tsv(a,comment="__",col_names=c("transcript_id","antisense_counts")) %>%       
      dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"rna_","")) %>%
      dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"-1$",""))
    # join the sense and antisense counts and give it an extra column
    df <- inner_join(st,at) %>% mutate(tp=tp)
    # bind the rows to the strain specific tibble generated above
    assign(x     = paste0("x",strain,"_counts"),
           value = dplyr::bind_rows(eval(parse(text=paste0("x",strain,"_counts"))),df))
  }
  # join this tibble with our transcript lengths for convenient TPM calculations
   assign(x     = paste0("x",strain,"_counts"),
           value = dplyr::inner_join(eval(parse(text=paste0("x",strain,"_counts"))),transcript_lengths))
}
# clean up
rm(s,a,st,at,df)

Calculate TPM values:

# Function for calculating TPMs from the count tibbles
# generated above
# 
calc_tpms <- function(counts) {
  # caluclate the normalizing factor for each sample
  # normFactor = sum of all length normalized transcript counts
  normFactors <- counts %>% 
  mutate(normCount=sense_counts/length) %>% 
  group_by(tp) %>% 
  summarise(normFactor=1/sum(normCount))
  
  # calculate the TPMs by joining it together with normFactors
  # and multiplying the length corrected counts by these factors
  # and by one million
  abund <- inner_join(normFactors,counts) %>%
  mutate(sense_tpm=(sense_counts/length)*normFactor*1e6,antisense_tpm=(antisense_counts/length)*normFactor*1e6) %>%
  dplyr::select(transcript_id,sense_counts,antisense_counts,sense_tpm,antisense_tpm,tp)
  
  return(abund)
}

# calculate the TPMs for each strain
x3d7_abund <- calc_tpms(x3d7_counts)
xhb3_abund <- calc_tpms(xhb3_counts)
xit_abund  <- calc_tpms(xit_counts)

Let’s just make sure they all add up to a million as they should:

# sum up TPM values for each time point
x3d7_abund %>% group_by(tp) %>% summarise(total=sum(sense_tpm))

# A tibble: 7 × 2
     tp total
  <int> <dbl>
1     1 1e+06
2     2 1e+06
3     3 1e+06
4     4 1e+06
5     5 1e+06
6     6 1e+06
7     7 1e+06

xhb3_abund %>% group_by(tp) %>% summarise(total=sum(sense_tpm))

# A tibble: 7 × 2
     tp total
  <int> <dbl>
1     1 1e+06
2     2 1e+06
3     3 1e+06
4     4 1e+06
5     5 1e+06
6     6 1e+06
7     7 1e+06

xit_abund %>% group_by(tp) %>% summarise(total=sum(sense_tpm))

# A tibble: 7 × 2
     tp total
  <int> <dbl>
1     1 1e+06
2     2 1e+06
3     3 1e+06
4     4 1e+06
5     5 1e+06
6     6 1e+06
7     7 1e+06

Write these abundance values to output:

# Sense TPM tables
select(x3d7_abund,transcript_id,sense_tpm,tp) %>% spread(tp,sense_tpm) %>%
  write_tsv(path="../output/transcript_abundance/htseq_3d7_sense_tpms.tsv")
select(xhb3_abund,transcript_id,sense_tpm,tp) %>% spread(tp,sense_tpm) %>%
  write_tsv(path="../output/transcript_abundance/htseq_hb3_sense_tpms.tsv")
select(xit_abund,transcript_id,sense_tpm,tp) %>% spread(tp,sense_tpm) %>%
  write_tsv(path="../output/transcript_abundance/htseq_it_sense_tpms.tsv")

# Antisense TPM tables
select(x3d7_abund,transcript_id,antisense_tpm,tp) %>% spread(tp,antisense_tpm) %>%
  write_tsv(path="../output/transcript_abundance/htseq_3d7_antisense_tpms.tsv")
select(xhb3_abund,transcript_id,antisense_tpm,tp) %>% spread(tp,antisense_tpm) %>%
  write_tsv(path="../output/transcript_abundance/htseq_hb3_antisense_tpms.tsv")
select(xit_abund,transcript_id,antisense_tpm,tp) %>% spread(tp,antisense_tpm) %>%
  write_tsv(path="../output/transcript_abundance/htseq_it_antisense_tpms.tsv")

# Sense and antisense counts in long format
saveRDS(object=x3d7_abund,file="../output/transcript_abundance/htseq_3d7_abund.rds")
saveRDS(object=xhb3_abund,file="../output/transcript_abundance/htseq_hb3_abund.rds")
saveRDS(object=xit_abund,file="../output/transcript_abundance/htseq_it_abund.rds")

That’s it! Now we can use these files for future analyses.

Session Information

sessionInfo()

R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: OS X El Capitan 10.11.6

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] rtracklayer_1.34.2   GenomicRanges_1.26.3 GenomeInfoDb_1.10.3 
 [4] IRanges_2.8.1        S4Vectors_0.12.1     BiocGenerics_0.20.0 
 [7] dplyr_0.5.0          purrr_0.2.2          readr_1.0.0         
[10] tidyr_0.6.1          tibble_1.2           ggplot2_2.2.1       
[13] tidyverse_1.1.1     

loaded via a namespace (and not attached):
 [1] SummarizedExperiment_1.4.0 reshape2_1.4.2            
 [3] haven_1.0.0                lattice_0.20-34           
 [5] colorspace_1.3-2           htmltools_0.3.5           
 [7] yaml_2.1.14                XML_3.98-1.5              
 [9] foreign_0.8-67             DBI_0.5-1                 
[11] BiocParallel_1.8.1         modelr_0.1.0              
[13] readxl_0.1.1               plyr_1.8.4                
[15] stringr_1.2.0              zlibbioc_1.20.0           
[17] Biostrings_2.42.1          workflowr_0.3.0           
[19] munsell_0.4.3              gtable_0.2.0              
[21] rvest_0.3.2                psych_1.6.12              
[23] evaluate_0.10              Biobase_2.34.0            
[25] knitr_1.15.1               forcats_0.2.0             
[27] broom_0.4.2                Rcpp_0.12.9               
[29] scales_0.4.1               backports_1.0.5           
[31] jsonlite_1.3               XVector_0.14.0            
[33] Rsamtools_1.26.1           mnormt_1.5-5              
[35] hms_0.3                    digest_0.6.12             
[37] stringi_1.1.2              grid_3.3.2                
[39] rprojroot_1.2              tools_3.3.2               
[41] bitops_1.0-6               magrittr_1.5              
[43] lazyeval_0.2.0             RCurl_1.95-4.8            
[45] Matrix_1.2-8               xml2_1.1.1                
[47] lubridate_1.6.0            assertthat_0.1            
[49] rmarkdown_1.3              httr_1.2.1                
[51] R6_2.2.0                   git2r_0.18.0              
[53] GenomicAlignments_1.10.0   nlme_3.1-131

This R Markdown site was created with workflowr