取轉錄本fasta最長的當作基因fasta
阿新 • • 發佈:2019-01-26
#!/usr/bin/env perl use warnings; use strict; use Bio::SeqIO; die "perl $0 <fasta> > <outfile>\n" if(@ARGV != 1); my @len = `fastalength $ARGV[0]`; my %xsh = map { chomp; my($length,$name)=split /\s+/; $name=>$length } @len; my %hash; open FA, $ARGV[0] or die $!; while(<FA>) { chomp; next if($_ !~ /^>/); if(/gene:([^ ]+) transcript:([^ ]+)/) { if(exists $xsh{$2}) { push @{$hash{$1}}, "$2 $xsh{$2}"; } } } my %trans; open OUT, ">longest_transcript.list" or die $!; foreach my $g(keys %hash) { my $tmp = 0; my $lt; foreach my $t(@{$hash{$g}}) { my @x = split / /, $t; ($x[1] > $tmp) ? $lt = $x[0] : next; } $trans{$lt} = 0; print OUT "$lt\n"; } $/ = "\n>"; open FB, $ARGV[0] or die $!; while(<FB>) { chomp; s/^>//; my @tmp = split /\n/; my @t = split /\s+/, $tmp[0]; if(exists $trans{$t[0]}) { print ">$_\n"; } } &tt; sub tt{ my $t = localtime; print STDERR "$t\n"; } =cut my ($seq_hash,$gene_hash) = &read_fasta(shift @ARGV); foreach my$gene(sort keys %$gene_hash) { my @names = @{$gene_hash->{$gene}}; my $name; my $len = 0; foreach (@names) { if ($seq_hash->{$_}->{len} > $len) { $name = $_; $len = $seq_hash->{$_}->{len}; } } print "$name\n$seq_hash->{$name}{seq}\n"; } sub read_fasta { my $file = shift; my $inseq = Bio::SeqIO->new(-file=>$file,-format=>"fasta"); my %hash; my %gene; while(my$seq=$inseq->next_seq) { my $name = $seq->id; my $gene = $1 if ($seq->desc =~ /gene:(\S+)/); $hash{$name}{seq} = $seq->seq; $hash{$name}{len} = $seq->length; push @{$gene{$gene}} , $name; } return (\%hash,\%gene); }