perl novel可變剪接識別(2)
阿新 • • 發佈:2019-02-12
博主其實對未知的可變剪接分類有些困惑,但想了很久才使用了一項比較複雜的演算法,接下來並不是分類,而是先轉換資料庫,因為ensembl|gencode資料庫的格式並不能滿足博主的需求,轉換後更能方便地處理接下來的工作:
#!/usr/bin/env perl use warnings; use strict; my (%gene); open GTF, $ARGV[0] or die $!; while(<GTF>) { chomp; next if(/^#/); my @tmp = split; my ($gid) = $_ =~ /gene_id "([^;]+)";/; #匹配基因名稱 if($_ =~ /gene_type "protein_coding";/) #分coding和noncoding分別處理,因為noncoding沒有UTR結構,不存在UTR的可變剪接 { if($tmp[2] =~ /exon/) { push @{$gene{$tmp[0]}{$gid}{$tmp[6]}}, "$tmp[3],$tmp[4],1"; } }else{ if($tmp[2] =~ /exon/) { push @{$gene{$tmp[0]}{$gid}{$tmp[6]}}, "$tmp[3],$tmp[4],0"; } } } close GTF; open OUT, ">$ARGV[0].myformat" or die $!; foreach my $c(keys %gene) { foreach my $g(keys %{$gene{$c}}) { foreach my $ps(keys %{$gene{$c}{$g}}) { if($ps eq '+') { my ($s, $e, $type); if(@{$gene{$c}{$g}{$ps}} eq 1) { ($s,$e, $type) = (split /,/, $gene{$c}{$g}{$ps}->[0])[0,1,2]; }else{ $s = (split /,/, $gene{$c}{$g}{$ps}->[0])[0]; $e = (split /,/, $gene{$c}{$g}{$ps}->[-1])[1]; $type = (split /,/, $gene{$c}{$g}{$ps}->[0])[2]; } my (@start, @end); foreach my $loci(@{$gene{$c}{$g}{$ps}}) { push @start, (split /,/, $loci)[0]; push @end, (split /,/, $loci)[1]; } my $ss = join ",", @start; my $ee = join ",", @end; print OUT join "\t", $c, $g, "$s-$e", $ps, $type, $ss, $ee, "\n"; }else{ my @ps_a = reverse @{$gene{$c}{$g}{$ps}}; my ($s, $e, $type); if(@ps_a eq 1) { ($s,$e,$type) = (split /,/, $ps_a[0])[0,1,2]; }else{ $s = (split /,/, $ps_a[0])[0]; $e = (split /,/, $ps_a[-1])[1]; $type = (split /,/, $ps_a[0])[2]; } my (@start, @end); foreach my $loci(@ps_a) { push @start, (split /,/, $loci)[0]; push @end, (split /,/, $loci)[1]; } my $ss = join ",", @start; my $ee = join ",", @end; print OUT join "\t", $c, $g, "$s-$e", $ps, $type, $ss, $ee, "\n"; } } } }
這個格式類似ucsc的refseq格式,當然只是類似而已,而博主所需求的是gene型別和exon起始與終止而已。
說到資料庫的轉換,博主還想起來一件事情,就是refseq轉gff格式,其實兩者差的有些多。
有個妹子寫了一個轉換的,還不錯,展示的內容非常的詳細想要啥結果都可以,當然在這之中博主也貢獻了一點功勞,嘿嘿!就拿來在這展示一下吧:
#!perl -w use strict; die "Usage : perl $0 <in.refGene.lst> <out.gff>" unless (@ARGV == 2); my ($in, $out) = @ARGV; my($pre, $insert, @inserts, $utr, $utr_o, $i, $j, $cds, $nm, $chr, $direction, $start_exon, $end_exon, $start_cds, $end_cds, $cds_num, $start, $end, $tmp, $gene, @starts, @ends, $up, $down); if ($in =~ /\.gz/){open IN, " gzip -dc $in | " || die $!;} else{open IN, $in || die $!;} if ($out =~ /\.gz/){open OUT, "| gzip > $out" || die $!;} else {open OUT , "> $out" || die $!;} while (<IN>){ chomp; ($nm, $chr, $direction, $start_exon, $end_exon, $start_cds, $end_cds, $cds_num, $start, $end, $tmp, $gene, $insert) = (split)[1..12,15]; if ($nm =~ /NM/){ $pre = 'mRNA'; }else{ $pre = 'ncRNA'; } $start =~ s/,$//; $end =~ s/,$//; $insert =~ s/,$//; $insert =~ s/\-1/\./g; if ($cds_num > 1){ @starts = split /,/, $start; @ends = split /,/, $end; @inserts = split /,/, $insert; }else{ @starts = ($start); @ends = ($end); @inserts = ($insert); } print OUT "$chr\trefGene\t$pre\t$start_exon\t$end_exon\t.\t$direction\t.\tID=$nm; name=$gene;\n"; if ($direction eq '+'){ $utr = 5; $utr_o = 3; #print OUT "$chr\trefGene\t5-UTR\t$start_exon\t",$start_cds-1,"\t.\t$direction\t.\tParent=$nm;\n"; } else { $utr = 3; $utr_o = 5; #print OUT "$chr\trefGene\t3-UTR\t$start_exon\t",$start_cds-1,"\t.\t$direction\t.\tParent=$nm;\n"; } for ($i = 0; $i < @starts; $i ++){ print OUT "$chr\trefGene\tintron\t",$ends[$i-1]+1,"\t",$starts[$i]-1,"\t.\t$direction\t.\tParent=$nm;\n" if ($i > 0); if ($pre eq 'ncRNA'){ print OUT "$chr\trefGene\tCDS\t$starts[$i]\t$ends[$i]\t.\t$direction\t$inserts[$i]\tParent=$nm;\n"; next; } if ($ends[$i] < $start_cds){ print OUT "$chr\trefGene\t$utr-UTR\t$starts[$i]\t$ends[$i]\t.\t$direction\t$inserts[$i]\tParent=$nm;\n"; }elsif($starts[$i] < $start_cds and $ends[$i] > $start_cds){ print OUT "$chr\trefGene\t$utr-UTR\t$starts[$i]\t", $start_cds - 1, "\t.\t$direction\t.\tParent=$nm;\n"; print OUT "$chr\trefGene\tCDS\t$start_cds\t$ends[$i]\t.\t$direction\t$inserts[$i]\tParent=$nm;\n"; }elsif($starts[$i] >= $start_cds and $ends[$i] <= $end_cds){ print OUT "$chr\trefGene\tCDS\t$starts[$i]\t$ends[$i]\t.\t$direction\t$inserts[$i]\tParent=$nm;\n"; }elsif($starts[$i] < $end_cds and $ends[$i] > $end_cds){ print OUT "$chr\trefGene\tCDS\t$starts[$i]\t$end_cds\t.\t$direction\t$inserts[$i]\tParent=$nm;\n"; print OUT "$chr\trefGene\t$utr_o-UTR\t", $end_cds + 1, "\t$ends[$i]\t.\t$direction\t.\tParent=$nm;\n"; }else{ print OUT "$chr\trefGene\t$utr_o-UTR\t$starts[$i]\t$ends[$i]\t.\t$direction\t$inserts[$i]\tParent=$nm;\n"; } } } close IN; close OUT;
上面是正常的gff格式,下面將新增一些別的東西:
#!perl -w use strict; die "Usage : perl $0 <in.file> <out.file> " if (@ARGV > 2); my ($in, $out) = @ARGV; $in ||= 'refGene.sort.2.gz'; $out ||= 'refGene.ann.gz'; if($in =~ /\.gz/){ open IN, "gzip -dc $in |" || die $!; }else{ open IN, $in || die $!; } if ($out =~ /\.gz/){ open OUT, "| gzip > $out " || die $!; }else{ open OUT, "> $out" || die $!; } my (@tmps, $chr, $ref, $region, $start, $end, $num, $dot, $id, $count_cds, $count_intron, $gene, $nm); my (%infos, %genes); my @chrs = (1..22, 'X', 'Y'); $/ = "\n>"; chomp (my $line = <IN>); @tmps = split /\n/, $line; my ($chr_b, $end_b, $orientation, $id_b) = $tmps[0] =~ /(chr\w+)\t\w+\t\w+RNA\t\d+\t(\d+)\t\.\t([\+\-])\t\.\t(ID=\S+; name=\S+)$/; $tmps[0] =~ s/>//; print OUT "$tmps[0]\n"; @tmps[1..$#tmps] = &add_num ($orientation, @tmps[1..$#tmps]); print OUT join "\n", @tmps[1..$#tmps]; print OUT "\n"; while (<IN>){ chomp; @tmps = split /\n/, $_; ($chr, $start, $end, $orientation, $id) = $tmps[0] =~ /(chr\w+)\t\w+\t\w+RNA\t(\d+)\t(\d+)\t\.\t([\+\-])\t\.\t(ID=\S+; name=\S+)$/; if ($chr eq $chr_b and $end_b < $start){ print OUT "$chr\trefGene\tintergenic\t",$end_b + 1, "\t", $start - 1, "\t.\t.\t.\t$id_b|$id\n"; } ($chr_b, $end_b ,$id_b) = ($chr, $end, $id); $tmps[0] =~ s/>//; print OUT "$tmps[0]\n"; @tmps[1..$#tmps] = &add_num ($orientation, @tmps[1..$#tmps]); print OUT join "\n", @tmps[1..$#tmps]; print OUT "\n"; } close IN; close OUT; #$/ = '\n'; sub add_num { my @tmps = @_; my ($count_cds, $count_intron) = (0, 0); if ($tmps[0] eq '+'){ for my $tmp (@tmps[1..$#tmps]){ if ($tmp =~ /CDS/){ $count_cds ++; $tmp =~ s/\.\t\+/$count_cds\t\+/; #$count_cds ++; }elsif($tmp =~ /intron/){ $count_intron ++; $tmp =~ s/\.\t\+/$count_intron\t\+/; #$count_intron ++; } } return @tmps[1..$#tmps]; } #return @tmps[1..$#tmps]; for my $tmp (@tmps[1..$#tmps]){ if ($tmp =~ /CDS/){ $count_cds ++; }elsif($tmp =~ /intron/){ $count_intron ++; } } for my $tmp (@tmps[1..$#tmps]){ if ($tmp =~ /CDS/){ $tmp =~ s/\.\t\-/$count_cds\t\-/; $count_cds --; }elsif($tmp =~ /intron/){ $tmp =~ s/\.\t\-/$count_intron\t\-/; $count_intron --; } } return @tmps[1..$#tmps]; }
指令碼如下:
less refGene.txt.gz | sort -k3,3 -k5,5n |gzip > refGene.txt.sort.gz
perl format.pl refGene.txt.sort.gz refGene.gff.gz
less refGene.gff.gz |perl -lane 'if($F[2] =~ /RNA/){$F[0] = ">$F[0]";}print join "\t",@F[0..7], "$F[8] $F[9]";' |gzip > refGene.gff.2.gz
perl ann.sort.pl refGene.gff.2.gz refGene.ann.gz
最終的轉換格式如下:
chr1 refGene intron 763230 764381 1 + . Parent=NR_047525;
chr1 refGene CDS 764382 764484 2 + . Parent=NR_047525;
chr1 refGene intron 764485 787305 2 + . Parent=NR_047525;
chr1 refGene CDS 787306 787490 3 + . Parent=NR_047525;
chr1 refGene intron 787491 788049 3 + . Parent=NR_047525;
chr1 refGene CDS 788050 788146 4 + . Parent=NR_047525;
chr1 refGene intron 788147 788769 4 + . Parent=NR_047525;
chr1 refGene CDS 788770 794826 5 + . Parent=NR_047525;
chr1 refGene intergenic 794827 803449 . . . ID=NR_047525; name=LOC643837;|ID=NR_027055; name=FAM41C;
chr1 refGene ncRNA 803450 812182 . - . ID=NR_027055; name=FAM41C;
chr1 refGene CDS 803450 804055 3 - . Parent=NR_027055;
chr1 refGene intron 804056 809490 2 - . Parent=NR_027055;
chr1 refGene CDS 809491 810535 2 - . Parent=NR_027055;
chr1 refGene intron 810536 812124 1 - . Parent=NR_027055;
chr1 refGene CDS 812125 812182 1 - . Parent=NR_027055;
chr1 refGene intergenic 812183 852951 . . . ID=NR_027055; name=FAM41C;|ID=NR_026874; name=LOC100130417;
chr1 refGene ncRNA 852952 854817 . - . ID=NR_026874; name=LOC100130417;
chr1 refGene CDS 852952 853100 4 - . Parent=NR_026874;
chr1 refGene intron 853101 853400 3 - . Parent=NR_026874;
chr1 refGene CDS 853401 853555 3 - . Parent=NR_026874;
chr1 refGene intron 853556 854203 2 - . Parent=NR_026874;
chr1 refGene CDS 854204 854295 2 - . Parent=NR_026874;
chr1 refGene intron 854296 854713 1 - . Parent=NR_026874;
chr1 refGene CDS 854714 854817 1 - . Parent=NR_026874;
chr1 refGene intergenic 854818 861119 . . . ID=NR_026874; name=LOC100130417;|ID=NM_152486; name=SAMD11;
chr1 refGene mRNA 861120 879961 . + . ID=NM_152486; name=SAMD11;
chr1 refGene 5-UTR 861120 861180 . + . Parent=NM_152486;
chr1 refGene intron 861181 861300 1 + . Parent=NM_152486;
chr1 refGene 5-UTR 861301 861320 . + . Parent=NM_152486;
chr1 refGene CDS 861321 861393 1 + 0 Parent=NM_152486;
第6列為exon和intron的排列順序,第8列為翻譯偏移量,intergenic為新增兩個gene或transcript之間的距離等,加強版的gff格式~~~