不同的方法從gtf中提取相關基因組信息in R
Method One:
library(GenomicRanges)
library(GenomicFeatures)
library(annotatr)
makeTxDbFromGFF
txdb <- annotatr::makeTxDbFromGFF(gff_file, format="gtf")
GRanges(txdb)
ebg <- transcriptsBy(txdb, by="seqlevels")
anno_info <- transcriptsBy(txdb, by=c("gene", "exon", "cds",‘three_prime_utr‘,‘five_prime_utr‘), use.names=FALSE)
library(org.Mm.eg.db)
x = get(sprintf(‘org.Mm.egSYMBOL‘, ‘Mm10‘))
mapped_genes = mappedkeys(x)
eg2symbol = as.data.frame(x[mapped_genes])
eg2symbol$ensemble <- mapIds(org.Mm.eg.db,keys = eg2symbol$gene_id,keytype = "ENTREZID",column = "ENSEMBL")
print(eg2symbol)
Method Two:
source("https://bioconductor.org/biocLite.R")
biocLite(‘GenomicFeatures‘)
library(‘GenomicFeatures‘)
biocLite("GenomicRanges")
library(‘GenomicRanges‘)
library(rtracklayer)
library(GenomicAlignments)
library("AnnotationDbi")
library("GenomicFeatures")
test <- import(gtfFile, format = "gtf")
#Abstract the first rank exon
exon_1=subset(mcols(test),mcols(test)$exon_number=="1")
#Abstract reads from bam files
reads <- readGAlignments(‘./test.bam‘)
reads <- as(reads, "GRanges") #change the class from dataframe to GRanges
txdb = makeTxDbFromGFF(‘./Homo_sapiens.GRCh38.87.gtf‘,format = "gtf",circ_seqs = character())
#Abstract sites information
genes <- genes(txdb)
TS=transcripts(txdb)
EX=exons(txdb)
EX_hits<- findOverlaps(reads, EX)
hits <- findOverlaps(reads, genes)
Method Three:
library(plyr)
pfgtf <- file.path("./Homo_sapiens.GRCh38.87.gtf")
gtf <- read.table(pfgtf, head = F, sep = "\t", stringsAsFactors = F)
test <- data.frame(do.call(rbind, strsplit(gtf$V9, split = "[; ]")), stringsAsFactors=F)
valueFind <- function(x, type = "gene_id"){
for(i in 1:length(x)){
if(sum(x == type) == 0){
return(NA)
}else{
return(x[which(x == type)[1]+1])
}
}
}
gtf_infor <- data.frame(gene_id = as.vector(apply(test,1,valueFind)),
transcript_id = as.vector(apply(test,1,function(x){valueFind(x,type = "transcript_id")})),
gene_name = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_name")})),
gene_biotype = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_biotype")})),
exon_id = as.vector(apply(test,1,function(x){valueFind(x,type = "exon_id")})),
exon_number = as.vector(apply(test,1,function(x){valueFind(x,type = "exon_number")})), stringsAsFactors=F)
gtf_info <- cbind(gtf[,c(1,3,4,5,7)], gtf_infor)
gtf_exon_info <- gtf_info[gtf_info$V3 == "exon", ]
colnames(gtf_exon_info)[1:5] <- c("chr", "type", "start", "end", "strand")
gtf_exon_info$exon_number <- as.numeric(gtf_exon_info$exon_number)
gtf_exon_info$exon_loci <- paste(gtf_exon_info$chr,":", gtf_exon_info$start, "-", gtf_exon_info$end, "_", gtf_exon_info$strand, sep = "")
data_split <- dlply(gtf_exon_info, .variables = "gene_id")
first_exon <- lapply(data_split, function(x){
x[x$exon_number == 1, ]
})
biocLit(‘IRanges‘)
library(‘IRanges‘)
####Compile first exons into IRanges object
IRanges_first_exon = lapply(first_exon,function(x) IRanges(x[,3],x[,4]))
first_exon_overlap = lapply(IRanges_first_exon, function(x) findOverlaps(IRanges_reads, x))
####統計每個基因上第一個外顯子的位置可以與我找出來reads,相互overlap到的數目
overlap_summary = lapply(first_exon_overlap,function(x) summary(x)[1])
###找出有map上reads的外顯子個數,以及上面的reads數
exist_overlap_reads=unlist(overlap_summary)[unlist(overlap_summary)!=‘0’]
length(exist_overlap_reads)
every_gene_map_reads=as.numeric(exist_overlap_reads)
Method Four:
rtracklayer::readGFF(gtfFile)
importGTF(file , skip = 5 , nrow = -1, use.data.table = TRUE , level = ‘gene‘ , features = NULL , print.feature = FALSE)
不同的方法從gtf中提取相關基因組信息in R