1. 程式人生 > >取轉錄本fasta最長的當作基因fasta

取轉錄本fasta最長的當作基因fasta

#!/usr/bin/env perl
use warnings;
use strict;
use Bio::SeqIO;

die "perl $0 <fasta> > <outfile>\n" if(@ARGV != 1);

my @len = `fastalength $ARGV[0]`;
my %xsh = map { chomp; my($length,$name)=split /\s+/; $name=>$length } @len;

my %hash;
open FA, $ARGV[0] or die $!;
while(<FA>)
{
	chomp;
	next if($_ !~ /^>/);
	if(/gene:([^ ]+) transcript:([^ ]+)/)
	{
		if(exists $xsh{$2})
		{
			push @{$hash{$1}}, "$2 $xsh{$2}";
		}
	}
}

my %trans;
open OUT, ">longest_transcript.list" or die $!;
foreach my $g(keys %hash)
{
	my $tmp = 0;
	my $lt;
	foreach my $t(@{$hash{$g}})
	{
		my @x = split / /, $t;
		($x[1] > $tmp) ? $lt = $x[0] : next;
	}
	$trans{$lt} = 0;
	print OUT "$lt\n";
}

$/ = "\n>";
open FB, $ARGV[0] or die $!;
while(<FB>)
{
	chomp;
	s/^>//;
	my @tmp = split /\n/;
	my @t = split /\s+/, $tmp[0];
	if(exists $trans{$t[0]})
	{
		print ">$_\n";
	}
}

&tt;
sub tt{
	my $t = localtime;
	print STDERR "$t\n";
}

=cut
my ($seq_hash,$gene_hash) = &read_fasta(shift @ARGV);

foreach my$gene(sort keys %$gene_hash)
{
	my @names = @{$gene_hash->{$gene}};
	my $name;
	my $len = 0;
	foreach (@names)
	{
		if ($seq_hash->{$_}->{len} > $len)
		{
			$name = $_;
			$len = $seq_hash->{$_}->{len};
		}
	}

	print "$name\n$seq_hash->{$name}{seq}\n";
}

sub read_fasta
{
	my $file = shift;

	my $inseq = Bio::SeqIO->new(-file=>$file,-format=>"fasta");
	my %hash;
	my %gene;
	while(my$seq=$inseq->next_seq)
	{
		my $name = $seq->id;
		my $gene = $1 if ($seq->desc =~ /gene:(\S+)/);
		
		$hash{$name}{seq} = $seq->seq;
		$hash{$name}{len} = $seq->length;
		
		push @{$gene{$gene}} , $name;
	}
	return (\%hash,\%gene);
}