利用R語言對RNA-Seq進行探索分析與差異表達分析
介紹
本文參考 bioconductor 中RNA-Seq workflow: gene-level exploratory analysis and differential expression並對其根據需要進行了增減。
試驗資料
資料來源
Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, Whitaker RM, Duan Q, Lasky-Su J, Nikolos C, Jester W, Johnson M, Panettieri R Jr, Tantisira KG, Weiss ST, Lu Q. “RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells.” PLoS One. 2014 Jun 13;9(6):e99625. PMID: 24926665. GEO: GSE52778.
在這個RNA-Seq試驗中,採用了4種呼吸道平滑肌肉細胞(airway smooth muscle cells),每種細胞均有 地塞米松治療、非治療兩類。共計8個樣本,儲存在 airway 包中。
原始資料的處理
高通量測序資料常採用 FASTQ 格式來保 存所測的鹼基讀段和質量分數。如圖 所示,FASTQ 格式以測序讀段為單位存 儲,每條讀段佔 4 行,其中第一行和的第三行由檔案識別標誌和讀段名(ID)組成(第一行以“@”開頭而第三行以“+”開頭;第三行中 ID 可以省略,但“+”不能省 略),第二行為鹼基序列,第四行為各鹼基所對應的測序質量分數序列。
採用 tophat/bowtie2 將原始資料fastq對映到基因組序列,得到bam檔案;
此處我們採用airway 自帶的bam檔案。
載入 airway 包, 並利用自帶bam檔案
library("airway")
dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir) # 檔案列表
## [1] "GSE52778_series_matrix.txt" "Homo_sapiens.GRCh37.75_subset.gtf"
## [3] "sample_table.csv" "SraRunInfo_SRP033351.csv"
## [5] "SRR1039508_subset.bam" "SRR1039508_subset.bam.bai"
## [7] "SRR1039509_subset.bam" "SRR1039512_subset.bam"
## [9] "SRR1039513_subset.bam" "SRR1039516_subset.bam"
## [11] "SRR1039517_subset.bam" "SRR1039520_subset.bam"
## [13] "SRR1039521_subset.bam"
csvfile <- file.path(dir,"sample_table.csv")
(sampleTable <- read.csv(csvfile,row.names=1)) # 獲取樣本資訊
## SampleName cell dex albut Run avgLength Experiment Sample BioSample
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345 SRS508568 SAMN02422669
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349 SRS508571 SAMN02422678
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353 SRS508575 SAMN02422682
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576 SAMN02422673
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357 SRS508579 SAMN02422683
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 SRS508580 SAMN02422677
filenames <- file.path(dir, paste0(sampleTable$Run, "_subset.bam")) # 提取bam檔案
獲取bam資料
library("Rsamtools")
filenames
## [1] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039508_subset.bam"
## [2] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039509_subset.bam"
## [3] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039512_subset.bam"
## [4] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039513_subset.bam"
## [5] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039516_subset.bam"
## [6] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039517_subset.bam"
## [7] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039520_subset.bam"
## [8] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039521_subset.bam"
bamfiles <- BamFileList(filenames, yieldSize=2000000) # 將bam檔案放入列表,yieldSize 表示每次被讀取的記錄數
seqinfo(bamfiles) # 序列的基本資訊
## Seqinfo object with 84 sequences from an unspecified genome:
## seqnames seqlengths isCircular genome
## 1 249250621 <NA> <NA>
## 10 135534747 <NA> <NA>
## 11 135006516 <NA> <NA>
## 12 133851895 <NA> <NA>
## 13 115169878 <NA> <NA>
## ... ... ... ...
## GL000210.1 27682 <NA> <NA>
## GL000231.1 27386 <NA> <NA>
## GL000229.1 19913 <NA> <NA>
## GL000226.1 15008 <NA> <NA>
## GL000207.1 4262 <NA> <NA>
seqlevels(bamfiles) # 所有的染色體名稱, GLxxxxxx.x表示genomic contig
## [1] "1" "10" "11" "12" "13" "14" "15"
## [8] "16" "17" "18" "19" "2" "20" "21"
## [15] "22" "3" "4" "5" "6" "7" "8"
## [22] "9" "MT" "X" "Y" "GL000192.1" "GL000225.1" "GL000194.1"
## [29] "GL000193.1" "GL000200.1" "GL000222.1" "GL000212.1" "GL000195.1" "GL000223.1" "GL000224.1"
## [36] "GL000219.1" "GL000205.1" "GL000215.1" "GL000216.1" "GL000217.1" "GL000199.1" "GL000211.1"
## [43] "GL000213.1" "GL000220.1" "GL000218.1" "GL000209.1" "GL000221.1" "GL000214.1" "GL000228.1"
## [50] "GL000227.1" "GL000191.1" "GL000208.1" "GL000198.1" "GL000204.1" "GL000233.1" "GL000237.1"
## [57] "GL000230.1" "GL000242.1" "GL000243.1" "GL000241.1" "GL000236.1" "GL000240.1" "GL000206.1"
## [64] "GL000232.1" "GL000234.1" "GL000202.1" "GL000238.1" "GL000244.1" "GL000248.1" "GL000196.1"
## [71] "GL000249.1" "GL000246.1" "GL000203.1" "GL000197.1" "GL000245.1" "GL000247.1" "GL000201.1"
## [78] "GL000235.1" "GL000239.1" "GL000210.1" "GL000231.1" "GL000229.1" "GL000226.1" "GL000207.1"
匯入基因組特徵(註釋)
eg. 外顯子的染色體位置, 基因的起始、終止位點
library("GenomicFeatures")
gtffile <- file.path(dir,"Homo_sapiens.GRCh37.75_subset.gtf")
(txdb <- makeTxDbFromGFF(gtffile, format="gtf"))
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: /Library/Frameworks/R.framework/Versions/library/airway/extdata/Homo_sapiens.GRCh37.75_subset.gtf
## # Organism: NA
## # miRBase build ID: NA
## # Genome: NA
## # transcript_nrow: 65
## # exon_nrow: 279
## # cds_nrow: 158
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-06-17 17:40:06 +0800 (Wed, 17 Jun 2015)
## # GenomicFeatures version at creation time: 1.20.1
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
(genes <- exonsBy(txdb, by="gene"))
## GRangesList object of length 20:
## $ENSG00000009724
## GRanges object with 18 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] 1 [11086580, 11087705] - | 98 ENSE00000818830
## [2] 1 [11090233, 11090307] - | 99 ENSE00000472123
## [3] 1 [11090805, 11090939] - | 100 ENSE00000743084
## [4] 1 [11094885, 11094963] - | 101 ENSE00000743085
## [5] 1 [11097750, 11097868] - | 103 ENSE00003520086
## ... ... ... ... ... ... ...
## [14] 1 [11106948, 11107176] - | 111 ENSE00003467404
## [15] 1 [11106948, 11107176] - | 112 ENSE00003489217
## [16] 1 [11107260, 11107280] - | 113 ENSE00001833377
## [17] 1 [11107260, 11107284] - | 114 ENSE00001472289
## [18] 1 [11107260, 11107290] - | 115 ENSE00001881401
##
## ...
## <19 more elements>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
seqlevels(genes) # 染色體的名字
## [1] "1"
染色體的名稱 seqlevels(bamfiles)
與 seqlevels(genes)
應該保持一致, 特別留意是否含有 “chr”, 要麼都有”chr”, 要麼都沒有。
基因計數
library("GenomicAlignments")
se <- summarizeOverlaps(features = genes, reads = bamfiles,
mode = "Union", # 讀段覆蓋的模式
singleEnd=FALSE, #雙末端 not 單末端
ignore.strand=TRUE,# True 表示忽略±鏈的限制
fragments=TRUE ) # 只應用於雙末端測序,true表示非成對的對端應該被計數
class(se) # 得到 SummarizedExperiment 資料,可用於後續計算
## [1] "SummarizedExperiment"
## attr(,"package")
## [1] "GenomicRanges"
上圖顯示的是SummarizedExperiment類(以及他的子類DESeqDataSet)的佈局, 粉紅色 assay(se)
表示實際的資料, 每行為一個基因,每列為一個樣本;
colData
表示樣本的具體資訊,隨後我們會對它進行填充;rowRanges
表示每一個基因的資訊。具體如下
se
## class: SummarizedExperiment
## dim: 20 8
## exptData(0):
## assays(1): counts
## rownames(20): ENSG00000009724 ENSG00000116649 ... ENSG00000271794 ENSG00000271895
## rowRanges metadata column names(0):
## colnames(8): SRR1039508_subset.bam SRR1039509_subset.bam ... SRR1039520_subset.bam
## SRR1039521_subset.bam
## colData names(0):
head(assay(se))
## SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam
## ENSG00000009724 38 28 66
## ENSG00000116649 1004 1255 1122
## ENSG00000120942 218 256 233
## ENSG00000120948 2751 2080 3353
## ENSG00000171819 4 50 19
## ENSG00000171824 869 1075 1115
## SRR1039513_subset.bam SRR1039516_subset.bam SRR1039517_subset.bam
## ENSG00000009724 24 42 41
## ENSG00000116649 1313 1100 1879
## ENSG00000120942 252 269 465
## ENSG00000120948 1614 3519 3716
## ENSG00000171819 543 1 10
## ENSG00000171824 1051 944 1405
## SRR1039520_subset.bam SRR1039521_subset.bam
## ENSG00000009724 47 36
## ENSG00000116649 745 1536
## ENSG00000120942 207 400
## ENSG00000120948 2220 1990
## ENSG00000171819 14 1067
## ENSG00000171824 748 1590
colSums(assay(se))
## SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam SRR1039513_subset.bam
## 6478 6501 7699 6801
## SRR1039516_subset.bam SRR1039517_subset.bam SRR1039520_subset.bam SRR1039521_subset.bam
## 8009 10849 5254 9168
colData(se)
## DataFrame with 8 rows and 0 columns
rowRanges(se)
## GRangesList object of length 20:
## $ENSG00000009724
## GRanges object with 18 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] 1 [11086580, 11087705] - | 98 ENSE00000818830
## [2] 1 [11090233, 11090307] - | 99 ENSE00000472123
## [3] 1 [11090805, 11090939] - | 100 ENSE00000743084
## [4] 1 [11094885, 11094963] - | 101 ENSE00000743085
## [5] 1 [11097750, 11097868] - | 103 ENSE00003520086
## ... ... ... ... ... ... ...
## [14] 1 [11106948, 11107176] - | 111 ENSE00003467404
## [15] 1 [11106948, 11107176] - | 112 ENSE00003489217
## [16] 1 [11107260, 11107280] - | 113 ENSE00001833377
## [17] 1 [11107260, 11107284] - | 114 ENSE00001472289
## [18] 1 [11107260, 11107290] - | 115 ENSE00001881401
##
## ...
## <19 more elements>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
str(metadata(rowRanges(se)))
## List of 1
## $ genomeInfo:List of 14
## ..$ Db type : chr "TxDb"
## ..$ Supporting package : chr "GenomicFeatures"
## ..$ Data source : chr "/Library/Frameworks/R.framework/Versions/library/airway/extdata/Homo_sapiens.GRCh37.75_subset.gtf"
## ..$ Organism : chr NA
## ..$ miRBase build ID : chr NA
## ..$ Genome : chr NA
## ..$ transcript_nrow : chr "65"
## ..$ exon_nrow : chr "279"
## ..$ cds_nrow : chr "158"
## ..$ Db created by : chr "GenomicFeatures package from Bioconductor"
## ..$ Creation time : chr "2015-06-17 17:40:06 +0800 (Wed, 17 Jun 2015)"
## ..$ GenomicFeatures version at creation time: chr "1.20.1"
## ..$ RSQLite version at creation time : chr "1.0.0"
## ..$ DBSCHEMAVERSION : chr "1.1"
(colData(se) <- DataFrame(sampleTable)) # 填充樣本的具體資訊,方便後續分組,尋找差異基因
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength Experiment Sample
## <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345 SRS508568
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349 SRS508571
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353 SRS508575
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357 SRS508579
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 SRS508580
## BioSample
## <factor>
## SRR1039508 SAMN02422669
## SRR1039509 SAMN02422675
## SRR1039512 SAMN02422678
## SRR1039513 SAMN02422670
## SRR1039516 SAMN02422682
## SRR1039517 SAMN02422673
## SRR1039520 SAMN02422683
## SRR1039521 SAMN02422677
注意:此處得到的資料需要採用EDSeq2包進行差異分析,所以不對資料進行標準化,切記。
差異表達基因分析
我們採用 DESeq2 包進行,差異表達基因的分析
# 此步採用 airway 包自帶的se資料進行後續操作,可以忽略。如果沒有進行上面的步驟也可以直接採用下面的資料進行後續操作。
data("airway")
se <- airway
library("DESeq2")
dds <- DESeqDataSet(se, design = ~ cell + dex) # design 引數為 formula,此處為cell和dex兩個因素,~ cell + dex表示我們想控制cell研究dex的影響。
採用DESeqDataSetFromMatrix函式從matrix中獲取資料
countdata <- assay(se) # 可以根據自己的需要填充自己的資料(matrix格式),這裡以assay(se)為例
class(countdata)
head(countdata)
coldata <- colData(se)
(ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ cell + dex))
dds$dex <- relevel(dds$dex, "untrt") # 將 untrt 定義為dex因素的第一水平,隨後的foldchange 將採用 trt/untrt
dds <- DESeq(dds)
(res <- results(dds)) # 得到結果,可以根據padj來挑選合適的差異表達基因,log2FoldChange來確定基因上調還是下調,pvalue的校正採用了Benjamini-Hochberg方法,具體見 ?p.adjust
## log2 fold change (MAP): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 20 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000009724 41.75438 -0.9605961 0.25246721 -3.804835 1.418988e-04 3.783967e-04
## ENSG00000116649 1222.28667 0.2071285 0.07112287 2.912263 3.588209e-03 7.176417e-03
## ENSG00000120942 280.89869 0.1403078 0.10305342 1.361505 1.733540e-01 2.311387e-01
## ENSG00000120948 2698.77923 -0.7665547 0.14936634 -5.132044 2.866120e-07 1.146448e-06
## ENSG00000171819 188.08482 4.3627872 0.57110974 7.639140 2.186777e-14 1.749421e-13
## ... ... ... ... ... ... ...
## ENSG00000238199 0.3913511 -0.3733789 1.6308428 -0.2289484 0.81890903 NA
## ENSG00000253086 0.1518861 -1.0350645 1.5729023 -0.6580603 0.51049939 NA
## ENSG00000264181 0.0000000 NA NA NA NA NA
## ENSG00000271794 0.0000000 NA NA NA NA NA
## ENSG00000271895 34.9593217 -0.5451150 0.2920692 -1.8663899 0.06198683 0.09917893
mcols(res, use.names=TRUE)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized counts for all samples
## log2FoldChange results log2 fold change (MAP): dex trt vs untrt
## lfcSE results standard error: dex trt vs untrt
## stat results Wald statistic: dex trt vs untrt
## pvalue results Wald test p-value: dex trt vs untrt
## padj results BH adjusted p-values
summary(res)
##
## out of 16 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2, 12%
## LFC < 0 (down) : 3, 19%
## outliers [1] : 0, 0%
## low counts [2] : 8, 50%
## (mean count < 12.1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
儲存資料
write.csv(res, file = '/your/path/')
sessionInfo
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 graphics grDevices utils datasets stats methods base
##
## other attached packages:
## [1] GenomicAlignments_1.4.1 GenomicFeatures_1.20.1 AnnotationDbi_1.30.1
## [4] Biobase_2.28.0 knitr_1.10.5 BiocStyle_1.6.0
## [7] Rsamtools_1.20.4 Biostrings_2.36.1 XVector_0.8.0
## [10] airway_0.102.0 DESeq2_1.8.1 RcppArmadillo_0.5.200.1.0
## [13] Rcpp_0.11.6 GenomicRanges_1.20.5 GenomeInfoDb_1.4.0
## [16] IRanges_2.2.4 S4Vectors_0.6.0 BiocGenerics_0.14.0
## [19] readr_0.1.1 sqldf_0.4-10 RSQLite_1.0.0
## [22] DBI_0.3.1 gsubfn_0.6-6 proto_0.3-10
## [25] dplyr_0.4.1 plyr_1.8.3
##
## loaded via a namespace (and not attached):
## [1] splines_3.2.0 Formula_1.2-1 assertthat_0.1 latticeExtra_0.6-26
## [5] yaml_2.1.13 lattice_0.20-31 chron_2.3-45 digest_0.6.8
## [9] RColorBrewer_1.1-2 colorspace_1.2-6 htmltools_0.2.6 XML_3.98-1.2
## [13] biomaRt_2.24.0 genefilter_1.50.0 zlibbioc_1.14.0 xtable_1.7-4
## [17] snow_0.3-13 scales_0.2.5 BiocParallel_1.2.3 annotate_1.46.0
## [21] ggplot2_1.0.1 nnet_7.3-9 survival_2.38-1 magrittr_1.5
## [25] evaluate_0.7 MASS_7.3-40 foreign_0.8-63 tools_3.2.0
## [29] formatR_1.2 stringr_1.0.0.9000 munsell_0.4.2 locfit_1.5-9.1
## [33] cluster_2.0.1 lambda.r_1.1.7 futile.logger_1.4.1 grid_3.2.0
## [37] RCurl_1.95-4.6 bitops_1.0-6 tcltk_3.2.0 rmarkdown_0.7
## [41] gtable_0.1.2 reshape2_1.4.1 gridExtra_0.9.1 rtracklayer_1.28.4
## [45] Hmisc_3.16-0 futile.options_1.0.0 stringi_0.4-1 geneplotter_1.46.0
## [49] rpart_4.1-9 acepack_1.3-3.3
library(knitr)
knit('/Users/lipidong/baiduyun/work/RFile/MarkDown/funSet.Rmd', output = '~/learn/blog/_posts/2015-06-17-RNA-Seq.md')