[maker-devel] [Caution: Message contains Redirect URL content] InterProScan protein domain & AED physical evidence filtering

Jonathan Featherston FeatherstonJ at arc.agric.za
Tue Nov 1 09:12:46 MDT 2016


Dear Allison

I'm not sure about your extra gene models but here is the script to perform quality filtering. A perl script I got from the forum somewhere (changed to txt in case it gets removed by mail server.

Regards
Jonathan
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://yandell-lab.org/pipermail/maker-devel_yandell-lab.org/attachments/20161101/db2aa6f6/attachment-0002.html>
-------------- next part --------------
#!/usr/bin/perl -w 

use strict;

#use lib ('/home/mcampbell/lib');

#use PostData;

use Getopt::Std;

use vars qw($opt_s $opt_d $opt_a $opt_p $opt_c $opt_m $opt_u);

getopts('sda:pcmu');

use FileHandle;



#-----------------------------------------------------------------------------

#----------------------------------- MAIN ------------------------------------

#-----------------------------------------------------------------------------

my $usage = "\n\nquality_filter.pl: generates defualt and standard 

gene builds from a maker geneated gff3_file with 

iprscan data pushed onto column 9 using ipr_update_gff.\n                                       

USAGE: quality_filter.pl -[options]<gff3_file>\n                                                                               

OPTIONS: -d  Prints transcripts with an AED <1 (MAKER default)         

         -s  Prints transcripts with an AED <1 and/or Pfam domain 

             if in gff3 (MAKER Standard)

         -a  <number between 0 and 1> Prints transcripts with an 

             AED < the given value\n\n";



my $FILE1 = $ARGV[0];

#my $FILE2 = $ARGV[1];

die($usage) unless $ARGV[0] && ($opt_a || $opt_s || $opt_d);



my %LU_G;

my %LU_T;



build_lus($FILE1);

#build_lu_tid($FILE2);

filter($FILE1);

#PostData(\%LU_G);

#PostData(\%LU_T);

#-----------------------------------------------------------------------------

#---------------------------------- SUBS -------------------------------------

#-----------------------------------------------------------------------------

sub build_lus{

    my $file = shift;

    my %data;

    my $fh = new FileHandle;

    $fh->open($file);



    while (defined(my $line = <$fh>)){

        chomp($line);

        last if $line =~ /^\#\#FASTA/;

        next if $line =~ /^\#/;



        my @array = split(/\t/, $line);

        next unless $array[2] =~ /mRNA/;

        my ($tid) = $array[8] =~ /ID\=(.+?);.*/;

	my ($gid) = $array[8] =~ /Parent\=(.+?);.*/;

        

	my @c9 = split(/\;/, $array[8]);

        foreach my $x (@c9){

            my ($k,$v) = $x =~ /(.+)\=(.+)/;

            $data{$k}=$v;

        }

	#load the LU

	if ($opt_s && 

	    (($data{'Dbxref'} && $data{'Dbxref'} =~ /Pfam/) ||

	     $data{'_AED'} < 1)){

	    $LU_G{$gid}=1;

	    $LU_T{$tid}=1;

	}

	elsif ($opt_d && $data{'_AED'} < 1){

	    $LU_G{$gid}=1;

	    $LU_T{$tid}=1;

	}

	elsif ($opt_a && $data{'_AED'} < $opt_a){

	    $LU_G{$gid}=1;

	    $LU_T{$tid}=1;

	}

	undef %data;

    }



}

#-----------------------------------------------------------------------------

sub filter{

    my $file = shift;



    my $fh = new FileHandle;

    $fh->open($file);

    print "##gff-version 3\n";

    while (defined(my $line = <$fh>)){

        chomp($line);



        last if $line =~ /^\#\#FASTA/;

        next if $line =~ /^\#/;

        my @array = split(/\t/, $line);

	

        if ($array[2] eq 'gene'){

	    my ($id) = $array[8] =~ /ID=(\S+?);/;

	    print $line."\n" if defined($LU_G{$id}); 

	}

	elsif ($array[2] eq 'mRNA'){

	    my ($id) = $array[8] =~ /ID=(\S+?);/;

	    print $line."\n" if defined($LU_T{$id}); 

	}

	elsif ($array[2] eq 'exon'|

	    $array[2] eq 'CDS'|

	    $array[2] eq 'three_prime_UTR'|

	    $array[2] eq 'five_prime_UTR'){



	    my $bool = 0;

	    my ($ids) = $array[8] =~ /Parent=(\S+);?/;

#	    my ($ids) = $array[8] =~ /Parent=(\S+?);/;

	    $ids =~ s/;//;



	    my @ids_array = split(/,/, $ids);

	    

	    foreach my $x (@ids_array){

		if (defined($LU_T{$x})){

		    $bool++;

		}

		else{

		    $line =~  s/$x[^:]//;

		}

	    }

	    print $line."\n" if $bool;

	}



	else{print $line."\n"}

    }

    $fh->close();

    

}

#-----------------------------------------------------------------------------

sub build_lu_gid{



    my $file = shift;       

    

    my $fh = new FileHandle;

    $fh->open($file);

    

    while (defined(my $line = <$fh>)){

	chomp($line);

	$LU_G{$line}=1;

    }

    $fh->close();

}

#-----------------------------------------------------------------------------

sub build_lu_tid{



    my $file = shift;       

    

    my $fh = new FileHandle;

    $fh->open($file);

    

    while (defined(my $line = <$fh>)){

	chomp($line);

	

	last if $line =~ /^\#\#FASTA/;

        next if $line =~ /^\#/;

        my @array = split(/\t/, $line);



        if ($array[2] =~ 'mRNA'){

	    my ($tid) = $line =~ /ID=(.+?);/;

            my ($gid) = $line =~ /Parent=(.+?);/;

	    if (defined($LU_G{$gid})){

		$LU_T{$tid}=1;

	    }

	}

    }

    $fh->close();

}

#-----------------------------------------------------------------------------



-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://yandell-lab.org/pipermail/maker-devel_yandell-lab.org/attachments/20161101/db2aa6f6/attachment-0002.htm>


More information about the maker-devel mailing list