從NCBI基因組資料中獲得cds,pep和geneID對應表
阿新 • • 發佈:2019-01-01
在做基因組相關分析時,我們常常需要從基因組中提取cds,並翻譯成相應的pep序列。此指令碼,以NCBI資料庫中標準的基因組序列檔案和對應的gff檔案為輸入檔案,快速獲得cds序列,pep序列,RNA,Protein和gene的對應關係表等相關檔案。
A perl script which deals with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.
使用方法如下:
perl $0 gff_filegenomes_fa_file file_prefix
#! /usr/bin/perl -w
=head1 #######################################
=head1 name
grep_cds_pep_from_ncbi_genomes_datas.pl
=head1 description
deal with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.
=head1 example
perl $0 ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz mole
perl $0 gff_file genomes_fa_file file_prefix
=head1 author
original from Xiangfeng Li, [email protected]
##2014-4-19/21##
=head1 #######################################
=cut
use strict;
die `pod2text $0` unless @ARGV==3;
my ($gff,$fa,$prefix)[email protected];
##deal gff file##
$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||die):(open GFF,$gff||die);
my (%mrna,%cds,%pep);
while(<GFF>){
chomp;
next if(/^#/);
my @p=split/\t/,$_;
my @q=split/;/,$p[8];
my ($rna,$pep,$nt,$gene);
my $chr=$p[0];
if($p[2] eq "mRNA"){
($rna=$q[0])=~s/ID=//;
($nt=$q[1])=~s/Name=//;
($gene=$q[-3])=~s/gene=//;
$mrna{$chr}{$rna}{strand}=$p[6];
$cds{$rna}=[$gene,$nt];
}
if($p[2] eq "CDS"){
($pep=$q[1])=~s/Name=//;
($rna=$q[2])=~s/Parent=//;
push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];
$pep{$rna}=$pep;
}
}
close GFF;
##get id list##
my %anno;
my $id_gene_cds_pep=$prefix."_id_gene_cds_pep.lst";
open ID, ">",$id_gene_cds_pep||die;
foreach my $i(sort keys %pep){
if($cds{$i}){
my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};
print ID $out,"\n";
($anno{$i}=$out)=~s/\t/\|/g;
}
}
warn "create file:$id_gene_cds_pep\n";
close ID;
##deal fa file##
my %max;
$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||die):(open FA,$fa||die);
my $raw_cds=$prefix."_raw_cds";
open CDS1,">$raw_cds"||die;
my ($start,$end);
$/=">";<FA>;$/="\n";
while(<FA>){
my $name=$1 if(/(\S+)/);
my $info=(split/\|/,$name)[-1];
$/=">";
my $seq=<FA>;
$/="\n";
$seq=~s/>|\n+//g;
my $scaf=$mrna{$info};
foreach my $k(sort keys %$scaf){
next if(! exists $scaf->{$k}{nt});
my @ [email protected]{$scaf->{$k}{nt}};
my $strand=$$scaf{$k}{strand};
my $get;
@p=sort{$a->[0]<=>$b->[0]}@p;
my $loc1=$p[0][0];
my $loc2=$p[-1][1];
my ($get_len,$add,$out,$gene);
if(exists $anno{$k}){
$add=$anno{$k};
$gene=(split/\|/,$add)[1];
}else{next;}
foreach(@p){
($start,$end) [email protected]$_[0,1];
$get.=uc(substr($seq,$start-1,$end-$start+1));
}
if($strand eq "+"){
$get_len=length$get;
$get=~s/([A-Z]{50})/$1\n/g;
chop($get) unless($get_len%50);
$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";
push @{$max{$gene}},[$get_len,$out];
print CDS1 $out;
}
if($strand eq "-"){
$get=&reverse_complement($get);
$get_len=length$get;
$get=~s/([A-Z]{50})/$1\n/g;
chop($get) unless($get_len%50);
$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";
push @{$max{$gene}},[$get_len,$out];
print CDS1 $out;
}
}
}
warn "create file:$raw_cds\n";
close FA;
close CDS1;
##get max transcript##
my $filter_cds=$prefix."_filter_cds";
open CDS2,">$filter_cds"||die;
my @a;
foreach my $j(keys %max){
my @[email protected]{$max{$j}};
@trans=sort{$a->[0] cmp $b->[0]}@trans;
push @a,$trans[-1][1];
}
my @a_new;
foreach(@a){
my $r=$1 if(/^>rna(\d+)/);
push @a_new,[$r,$_];
}
my @cds_sort=map{$_->[1]}
sort{$a->[0] <=> $b->[0]}@a_new;
print CDS2 $_ [email protected]_sort;
close CDS2;
warn "create file:$filter_cds\n";
##get raw pep sequences##
my $raw_pep=$prefix."_raw_pep";
open PEP1,">",$raw_pep||die;
my @raw_pep=&cds2pep($raw_cds);
print PEP1 $_ [email protected]_pep;
close PEP1;
warn "create file:$raw_pep\n";
##get filter pep sequences##
my $filter_pep=$prefix."_filter_pep";
open PEP2,">$filter_pep"||die;
my @filter_pep=&cds2pep($filter_cds);
print PEP2 $_ [email protected]_pep;
close PEP2;
warn "create file:$filter_pep\n";
##add label for cds and pep of filter##
my $label=uc($prefix);
open IN1,$filter_cds||die;
my $filter_cds_label=$prefix."_filter_cds_label";
open OUT1,">",$filter_cds_label||die;
while(<IN1>){
chomp;
if(/^>/){
my @a=split/\|/,$_,2;
my $name=$a[0]."_$label";
print OUT1"$name |$a[1]\n";
}else{print OUT1 "$_\n";}
}
close IN1;
close OUT1;
warn "create file:$filter_cds_label\n";
open IN2,$filter_pep||die;
my $filter_pep_label=$prefix."_filter_pep_label";
open OUT2,">",$filter_pep_label||die;
while(<IN2>){
chomp;
if(/^>/){
my @a=split/\|/,$_,2;
my $name=$a[0]."_$label";
print OUT2 "$name |$a[1]\n";
}else{print OUT2 "$_\n";}
}
close IN2;
close OUT2;
warn "create file:$filter_pep_label\n";
##timing##
my $time=times;
my $time_out=sprintf "%.2f",$time/60;
print "##########\nElapsed Time :$time_out mins\n##########\n";
##subroutine##
sub reverse_complement{
my ($seq)=shift;
$seq=reverse$seq;
$seq=~tr/AaGgCcTt/TtCcGgAa/;
return $seq;
}
##subroutine##
sub cds2pep{
my $file=shift;
my %code = (
"standard" =>
{
'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A', # Alanine
'TGC' => 'C', 'TGT' => 'C', # Cysteine
'GAC' => 'D', 'GAT' => 'D', # Aspartic Aci
'GAA' => 'E', 'GAG' => 'E', # Glutamic Aci
'TTC' => 'F', 'TTT' => 'F', # Phenylalanin
'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G', # Glycine
'CAC' => 'H', 'CAT' => 'H', # Histidine
'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I', # Isoleucine
'AAA' => 'K', 'AAG' => 'K', # Lysine
'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L', # Leucine
'ATG' => 'M', # Methionine
'AAC' => 'N', 'AAT' => 'N', # Asparagine
'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P', # Proline
'CAA' => 'Q', 'CAG' => 'Q', # Glutamine
'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R', # Arginine
'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S', # Serine
'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T', # Threonine
'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V', # Valine
'TGG' => 'W', # Tryptophan
'TAC' => 'Y', 'TAT' => 'Y', # Tyrosine
'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U' # Stop
}
## more translate table could be added here in future
## more translate table could be added here in future
## more translate table could be added here in future
);
open IN,$file||die;
$/=">";<IN>;$/="\n";
my @results_set;
while(<IN>){
my $info=$_;
my @a=split/\s+/,$info;
$/=">";my $seq=<IN>;$/="\n";
$seq=~s/\n|>//g;
my $len=length$seq;
my $info_out=join " ",@a[0..($#a-1)];
my ($pep_out,$triplet);
for(my $i=0;$i<$len;$i+=3){
$triplet=substr($seq,$i,3);
next if(length$triplet!=3);
if(exists $code{standard}{$triplet}){
$pep_out.=$code{standard}{$triplet};
}else{$pep_out.="X"}
}
$pep_out=~s/U$// if($pep_out=~/U$/);
my $pep_len=length$pep_out;
$pep_out=~s/([A-Z]{50})/$1\n/g;
chop($pep_out) unless($pep_len%50);
my $results= ">$info_out length=$pep_len\n$pep_out\n";
push @results_set,$results;
}
return @results_set;
}
__END__