Last updated: 2017-03-28
Code version: 0e2e80a
We can use several different methods to calculate transcript abundances. Here I will use StringTie to estimate TPM values directly and HTSeq to generate read counts from which we will calculate TPMs ourselves.
First, we make sure we have the appropriate annotation file:
cat data/annotations/exons_nuclear_3D7_v24.gff | \
gffread -F -E -o- > output/transcript_abundance/stringtie_annotation.gff
Now let’s run StringTie:
for strain in 3d7 hb3 it; do
for i in $(seq 1 7); do
for s in rf fr; do
stringtie -G output/transcript_abundance/stringtie_annotation.gff \
-e \
--"${s}" \
-p 8 \
-A output/transcript_abundance/stringtie/"${strain}_tp${i}_${s}.abund" \
"data/bam/mapped_to_3d7/${strain}.3d7_v3_chr.tp${i}.bam" > \
"output/transcript_abundance/stringtie/${strain}_tp${i}_${s}.gtf"
done
done
done
We’ll also quantify read counts using HTSeq. First, generate another annotation, since HTSeq expects a GTF file:
cat data/annotations/exons_nuclear_3D7_v24.gff | \
gffread -F -E -T -o- > output/transcript_abundance/htseq_annotation.gtf
And then run HTSeq using the following command. Note that, consistent with how the counts are imported below, --stranded=reverse yields sense counts and --stranded=yes yields antisense counts for these libraries:
for strain in 3d7 hb3 it; do
for i in $(seq 1 7); do
for s in yes reverse; do
htseq-count --format=bam --order=pos --stranded=${s} --minaqual=30 --idattr=transcript_id --mode=union \
"data/bam/mapped_to_3d7/${strain}.3d7_v3_chr.tp${i}.bam" \
"output/transcript_abundance/htseq_annotation.gtf" > \
"output/transcript_abundance/htseq/${strain}_tp${i}_${s}.txt"
done
done
done
Now we’ll import the results and reformat them for future use.
Here we’ll import the StringTie data:
# For each strain
for (strain in c("3d7","hb3","it")) {
# create a tibble with the right columns
assign(x = paste0("x",strain,"_tpm"),
value = tibble::tibble(transcript_id=character(),
cov=double(),
FPKM=double(),
TPM=double(),
tp=integer(),
strain=character()))
# for each timepoint
for (tp in seq(1,7)) {
# generate file path
f <- paste0("../output/transcript_abundance/stringtie/",strain,"_tp",tp,"_rf.gtf")
# import the file and convert to the right format
df <- tibble::as_tibble(rtracklayer::import.gff(f)) %>%
dplyr::filter(type=="transcript") %>%
dplyr::select(transcript_id,cov,FPKM,TPM) %>%
dplyr::mutate(cov=as.double(cov),FPKM=as.double(FPKM),TPM=as.double(TPM),tp=tp,strain=strain) %>%
dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"rna_","")) %>%
dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"-1$",""))
# bind this data to the tibble we created earlier for each strain
assign(x = paste0("x",strain,"_tpm"),
value = dplyr::bind_rows(eval(parse(text=paste0("x",strain,"_tpm"))),df))
}
}
# clean up
rm(df)
And let’s save these to output:
# StringTie abundance estimates in long format
saveRDS(object=x3d7_tpm,file="../output/transcript_abundance/stringtie_3d7_abund.rds")
saveRDS(object=xhb3_tpm,file="../output/transcript_abundance/stringtie_hb3_abund.rds")
saveRDS(object=xit_tpm,file="../output/transcript_abundance/stringtie_it_abund.rds")
In order to properly calculate TPMs, we need to calculate effective transcript lengths. To do this, we need the mean (or median) of the fragment length distribution for each sample. We can generate this data using Picard:
for file in $(find data/bam/mapped_to_3d7/ -name "*.bam"); do
picard CollectInsertSizeMetrics INPUT=$file HISTOGRAM_FILE=${file%.*}_insert_size.hist OUTPUT=${file%.*}_insert_size.out;
done
Now we can read each file in and record the mean fragment length:
# create a tibble with the right columns
inserts <- tibble(sample=character(),mean_insert_size=double(),median_insert_size=double(),tp=integer())
# for each strain
for (strain in c("3d7","hb3","it")) {
for (tp in seq(1,7)) {
# generate the file path
f <- paste0("../data/bam/mapped_to_3d7/",strain,".3d7_v3_chr.tp",tp,"_insert_size.out")
# extract mean and median insert sizes
metrics <- read_tsv(f,skip=6,n_max = 1)
mean_is <- metrics$MEAN_INSERT_SIZE
median_is <- metrics$MEDIAN_INSERT_SIZE
# bind this row to the tibble we created earlier
inserts <- dplyr::bind_rows(inserts,tibble(sample=paste0("x",strain,"_tp",tp),mean_insert_size=mean_is,median_insert_size=median_is,tp=tp))
}
}
It turns out, however, that using effective transcript lengths in Plasmodium is problematic: some transcripts end up with a negative effective length under the usual formula (transcript length minus the mean fragment length, plus one), because many Plasmodium transcripts are shorter than the mean fragment length. So we’ll simply use full transcript lengths instead of “correcting” for that bias.
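To make this concrete, here is a minimal sketch (not part of the original pipeline; the 200 bp mean fragment length is an illustrative placeholder) of how full transcript lengths can be derived from the GTF by summing exon widths, and why the effective length formula goes negative for short transcripts:
# Sketch only: derive full transcript lengths from the HTSeq GTF by summing
# exon widths per transcript, then flag transcripts whose effective length
# (length - mean fragment length + 1) would be zero or negative.
library(tidyverse)
library(rtracklayer)
exons <- tibble::as_tibble(rtracklayer::import.gff("../output/transcript_abundance/htseq_annotation.gtf")) %>%
  dplyr::filter(type == "exon")
transcript_lengths <- exons %>%
  dplyr::group_by(transcript_id) %>%
  dplyr::summarise(length = sum(width))
# with an assumed mean fragment length of 200 bp, any transcript shorter
# than 200 bp ends up with a non-positive effective length
transcript_lengths %>%
  dplyr::mutate(effective_length = length - 200 + 1) %>%
  dplyr::filter(effective_length <= 0)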
Import the HTSeq data:
# import our calculated transcript lengths
transcript_lengths <- read_tsv("../output/transcript_abundance/transcript_lengths.tsv")
# for each strain
for (strain in c("3d7","hb3","it")) {
# create a tibble with the right columns
assign(x = paste0("x",strain,"_counts"),
value = tibble(transcript_id=character(),
sense_counts=double(),
antisense_counts=double()))
# for each time point
for (tp in seq(1,7)) {
# generate the file names
s <- paste0("../output/transcript_abundance/htseq/",strain,"_tp",tp,"_reverse.txt")
a <- paste0("../output/transcript_abundance/htseq/",strain,"_tp",tp,"_yes.txt")
# and import sense and antisense counts and process their raw values
st <- read_tsv(s,comment="__",col_names=c("transcript_id","sense_counts")) %>%
dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"rna_","")) %>%
dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"-1$",""))
at <- read_tsv(a,comment="__",col_names=c("transcript_id","antisense_counts")) %>%
dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"rna_","")) %>%
dplyr::mutate(transcript_id=stringr::str_replace(transcript_id,"-1$",""))
# join the sense and antisense counts and give it an extra column
df <- dplyr::inner_join(st,at,by="transcript_id") %>% dplyr::mutate(tp=tp)
# bind the rows to the strain specific tibble generated above
assign(x = paste0("x",strain,"_counts"),
value = dplyr::bind_rows(eval(parse(text=paste0("x",strain,"_counts"))),df))
}
# join this tibble with our transcript lengths for convenient TPM calculations
assign(x = paste0("x",strain,"_counts"),
value = dplyr::inner_join(eval(parse(text=paste0("x",strain,"_counts"))),transcript_lengths))
}
# clean up
rm(s,a,st,at,df)
Calculate TPM values:
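For reference, the TPM for transcript i in a given sample, with raw count c_i and length l_i, is

TPM_i = ( (c_i / l_i) / sum_j (c_j / l_j) ) * 10^6

The function below computes the per-sample normalization factor 1 / sum_j (c_j / l_j) from the sense counts and applies it to both the sense and antisense counts.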
# Function for calculating TPMs from the count tibbles
# generated above
#
calc_tpms <- function(counts) {
# calculate the normalizing factor for each sample
# normFactor = 1 / (sum of all length-normalized sense counts)
normFactors <- counts %>%
mutate(normCount=sense_counts/length) %>%
group_by(tp) %>%
summarise(normFactor=1/sum(normCount))
# calculate the TPMs by joining it together with normFactors
# and multiplying the length corrected counts by these factors
# and by one million
abund <- dplyr::inner_join(normFactors,counts,by="tp") %>%
mutate(sense_tpm=(sense_counts/length)*normFactor*1e6,antisense_tpm=(antisense_counts/length)*normFactor*1e6) %>%
dplyr::select(transcript_id,sense_counts,antisense_counts,sense_tpm,antisense_tpm,tp)
return(abund)
}
# calculate the TPMs for each strain
x3d7_abund <- calc_tpms(x3d7_counts)
xhb3_abund <- calc_tpms(xhb3_counts)
xit_abund <- calc_tpms(xit_counts)
Let’s just make sure the sense TPMs add up to a million for each time point, as they should:
# sum up TPM values for each time point
x3d7_abund %>% group_by(tp) %>% summarise(total=sum(sense_tpm))
# A tibble: 7 × 2
tp total
<int> <dbl>
1 1 1e+06
2 2 1e+06
3 3 1e+06
4 4 1e+06
5 5 1e+06
6 6 1e+06
7 7 1e+06
xhb3_abund %>% group_by(tp) %>% summarise(total=sum(sense_tpm))
# A tibble: 7 × 2
tp total
<int> <dbl>
1 1 1e+06
2 2 1e+06
3 3 1e+06
4 4 1e+06
5 5 1e+06
6 6 1e+06
7 7 1e+06
xit_abund %>% group_by(tp) %>% summarise(total=sum(sense_tpm))
# A tibble: 7 × 2
tp total
<int> <dbl>
1 1 1e+06
2 2 1e+06
3 3 1e+06
4 4 1e+06
5 5 1e+06
6 6 1e+06
7 7 1e+06
Write these abundance values to output:
# Sense TPM tables
select(x3d7_abund,transcript_id,sense_tpm,tp) %>% spread(tp,sense_tpm) %>%
write_tsv(path="../output/transcript_abundance/htseq_3d7_sense_tpms.tsv")
select(xhb3_abund,transcript_id,sense_tpm,tp) %>% spread(tp,sense_tpm) %>%
write_tsv(path="../output/transcript_abundance/htseq_hb3_sense_tpms.tsv")
select(xit_abund,transcript_id,sense_tpm,tp) %>% spread(tp,sense_tpm) %>%
write_tsv(path="../output/transcript_abundance/htseq_it_sense_tpms.tsv")
# Antisense TPM tables
select(x3d7_abund,transcript_id,antisense_tpm,tp) %>% spread(tp,antisense_tpm) %>%
write_tsv(path="../output/transcript_abundance/htseq_3d7_antisense_tpms.tsv")
select(xhb3_abund,transcript_id,antisense_tpm,tp) %>% spread(tp,antisense_tpm) %>%
write_tsv(path="../output/transcript_abundance/htseq_hb3_antisense_tpms.tsv")
select(xit_abund,transcript_id,antisense_tpm,tp) %>% spread(tp,antisense_tpm) %>%
write_tsv(path="../output/transcript_abundance/htseq_it_antisense_tpms.tsv")
# Sense and antisense counts and TPMs in long format
saveRDS(object=x3d7_abund,file="../output/transcript_abundance/htseq_3d7_abund.rds")
saveRDS(object=xhb3_abund,file="../output/transcript_abundance/htseq_hb3_abund.rds")
saveRDS(object=xit_abund,file="../output/transcript_abundance/htseq_it_abund.rds")
That’s it! Now we can use these files for future analyses.
sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: OS X El Capitan 10.11.6
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] rtracklayer_1.34.2 GenomicRanges_1.26.3 GenomeInfoDb_1.10.3
[4] IRanges_2.8.1 S4Vectors_0.12.1 BiocGenerics_0.20.0
[7] dplyr_0.5.0 purrr_0.2.2 readr_1.0.0
[10] tidyr_0.6.1 tibble_1.2 ggplot2_2.2.1
[13] tidyverse_1.1.1
loaded via a namespace (and not attached):
[1] SummarizedExperiment_1.4.0 reshape2_1.4.2
[3] haven_1.0.0 lattice_0.20-34
[5] colorspace_1.3-2 htmltools_0.3.5
[7] yaml_2.1.14 XML_3.98-1.5
[9] foreign_0.8-67 DBI_0.5-1
[11] BiocParallel_1.8.1 modelr_0.1.0
[13] readxl_0.1.1 plyr_1.8.4
[15] stringr_1.2.0 zlibbioc_1.20.0
[17] Biostrings_2.42.1 workflowr_0.3.0
[19] munsell_0.4.3 gtable_0.2.0
[21] rvest_0.3.2 psych_1.6.12
[23] evaluate_0.10 Biobase_2.34.0
[25] knitr_1.15.1 forcats_0.2.0
[27] broom_0.4.2 Rcpp_0.12.9
[29] scales_0.4.1 backports_1.0.5
[31] jsonlite_1.3 XVector_0.14.0
[33] Rsamtools_1.26.1 mnormt_1.5-5
[35] hms_0.3 digest_0.6.12
[37] stringi_1.1.2 grid_3.3.2
[39] rprojroot_1.2 tools_3.3.2
[41] bitops_1.0-6 magrittr_1.5
[43] lazyeval_0.2.0 RCurl_1.95-4.8
[45] Matrix_1.2-8 xml2_1.1.1
[47] lubridate_1.6.0 assertthat_0.1
[49] rmarkdown_1.3 httr_1.2.1
[51] R6_2.2.0 git2r_0.18.0
[53] GenomicAlignments_1.10.0 nlme_3.1-131