[maker-devel] [Caution: Message contains Redirect URL content] InterProScan protein domain & AED physical evidence filtering
Jonathan Featherston
FeatherstonJ at arc.agric.za
Tue Nov 1 09:12:46 MDT 2016
Dear Allison
I'm not sure about your extra gene models but here is the script to perform quality filtering. A perl script I got from the forum somewhere (changed to txt in case it gets removed by mail server.
Regards
Jonathan
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://yandell-lab.org/pipermail/maker-devel_yandell-lab.org/attachments/20161101/db2aa6f6/attachment-0002.html>
-------------- next part --------------
#!/usr/bin/perl -w
use strict;
#use lib ('/home/mcampbell/lib');
#use PostData;
use Getopt::Std;
use vars qw($opt_s $opt_d $opt_a $opt_p $opt_c $opt_m $opt_u);
getopts('sda:pcmu');
use FileHandle;
#-----------------------------------------------------------------------------
#----------------------------------- MAIN ------------------------------------
#-----------------------------------------------------------------------------
my $usage = "\n\nquality_filter.pl: generates defualt and standard
gene builds from a maker geneated gff3_file with
iprscan data pushed onto column 9 using ipr_update_gff.\n
USAGE: quality_filter.pl -[options]<gff3_file>\n
OPTIONS: -d Prints transcripts with an AED <1 (MAKER default)
-s Prints transcripts with an AED <1 and/or Pfam domain
if in gff3 (MAKER Standard)
-a <number between 0 and 1> Prints transcripts with an
AED < the given value\n\n";
my $FILE1 = $ARGV[0];
#my $FILE2 = $ARGV[1];
die($usage) unless $ARGV[0] && ($opt_a || $opt_s || $opt_d);
my %LU_G;
my %LU_T;
build_lus($FILE1);
#build_lu_tid($FILE2);
filter($FILE1);
#PostData(\%LU_G);
#PostData(\%LU_T);
#-----------------------------------------------------------------------------
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
sub build_lus{
my $file = shift;
my %data;
my $fh = new FileHandle;
$fh->open($file);
while (defined(my $line = <$fh>)){
chomp($line);
last if $line =~ /^\#\#FASTA/;
next if $line =~ /^\#/;
my @array = split(/\t/, $line);
next unless $array[2] =~ /mRNA/;
my ($tid) = $array[8] =~ /ID\=(.+?);.*/;
my ($gid) = $array[8] =~ /Parent\=(.+?);.*/;
my @c9 = split(/\;/, $array[8]);
foreach my $x (@c9){
my ($k,$v) = $x =~ /(.+)\=(.+)/;
$data{$k}=$v;
}
#load the LU
if ($opt_s &&
(($data{'Dbxref'} && $data{'Dbxref'} =~ /Pfam/) ||
$data{'_AED'} < 1)){
$LU_G{$gid}=1;
$LU_T{$tid}=1;
}
elsif ($opt_d && $data{'_AED'} < 1){
$LU_G{$gid}=1;
$LU_T{$tid}=1;
}
elsif ($opt_a && $data{'_AED'} < $opt_a){
$LU_G{$gid}=1;
$LU_T{$tid}=1;
}
undef %data;
}
}
#-----------------------------------------------------------------------------
sub filter{
my $file = shift;
my $fh = new FileHandle;
$fh->open($file);
print "##gff-version 3\n";
while (defined(my $line = <$fh>)){
chomp($line);
last if $line =~ /^\#\#FASTA/;
next if $line =~ /^\#/;
my @array = split(/\t/, $line);
if ($array[2] eq 'gene'){
my ($id) = $array[8] =~ /ID=(\S+?);/;
print $line."\n" if defined($LU_G{$id});
}
elsif ($array[2] eq 'mRNA'){
my ($id) = $array[8] =~ /ID=(\S+?);/;
print $line."\n" if defined($LU_T{$id});
}
elsif ($array[2] eq 'exon'|
$array[2] eq 'CDS'|
$array[2] eq 'three_prime_UTR'|
$array[2] eq 'five_prime_UTR'){
my $bool = 0;
my ($ids) = $array[8] =~ /Parent=(\S+);?/;
# my ($ids) = $array[8] =~ /Parent=(\S+?);/;
$ids =~ s/;//;
my @ids_array = split(/,/, $ids);
foreach my $x (@ids_array){
if (defined($LU_T{$x})){
$bool++;
}
else{
$line =~ s/$x[^:]//;
}
}
print $line."\n" if $bool;
}
else{print $line."\n"}
}
$fh->close();
}
#-----------------------------------------------------------------------------
sub build_lu_gid{
my $file = shift;
my $fh = new FileHandle;
$fh->open($file);
while (defined(my $line = <$fh>)){
chomp($line);
$LU_G{$line}=1;
}
$fh->close();
}
#-----------------------------------------------------------------------------
sub build_lu_tid{
my $file = shift;
my $fh = new FileHandle;
$fh->open($file);
while (defined(my $line = <$fh>)){
chomp($line);
last if $line =~ /^\#\#FASTA/;
next if $line =~ /^\#/;
my @array = split(/\t/, $line);
if ($array[2] =~ 'mRNA'){
my ($tid) = $line =~ /ID=(.+?);/;
my ($gid) = $line =~ /Parent=(.+?);/;
if (defined($LU_G{$gid})){
$LU_T{$tid}=1;
}
}
}
$fh->close();
}
#-----------------------------------------------------------------------------
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://yandell-lab.org/pipermail/maker-devel_yandell-lab.org/attachments/20161101/db2aa6f6/attachment-0002.htm>
More information about the maker-devel
mailing list