#!/usr/bin/perl =head1 NAME blast_result_HSP_rep_check.pl =head1 SYNOPSIS write this... Make sure that you include a "id" auto-incremented primary key to identify each HSP row. For Instance, with the SQL: alter table blast_res_hvgissr_bacpacpseudo add column HSP_id int unsigned not null auto_increment after bit_score , add primary key (HSP_id) , type=MyISAM Include also a new field (column) to hold Repeated_HSP info: alter table blast_res_hvgissr_bacpacpseudo add column Repeated_HSP tinyint(1) unsigned not null default '0' after bit_score , type=MyISAM These need to be updated in the database prior to running this script =head1 DESCRIPTION write this =head1 ARGUMENTS -host -user -p[ass] -dbname -help =cut use strict; use DBI; use Getopt::Long; my $host = '0.0.0.0'; # Set the IP address of your mysql server here. my $dbname = 'est_ssr'; # The database with the blast_results_table to check. my $tab_name = 'blast_res_osgi_chr1pseudo'; # The table to check. my $dbuser = undef; my $dbpass = undef; #my $dbuser = 'perlscript'; #my $dbpass = 'perlscript'; my $sth = undef; my $help = undef; # The columns to keep from the SQL query my %table_columns=qw( Qacc 1 Sacc 1 ); my %table2_columns=qw( Qacc 1 Sacc 1 Qstart 1 Qend 1 Sstart 1 Send 1 E_val 1 bit_score 1 Repeated_HSP 1 HSP_id 1 ); my $best_HSP_ref=undef; &GetOptions( 'host=s' => \$host, 'dbname=s' => \$dbname, 'user=s' => \$dbuser, 'p|pass=s' => \$dbpass, 'h|help+' => \$help, ); if( defined $help ) { system("perldoc $0"); exit(0); }if (!defined $dbuser) { warn "Please provide a username with access to database (such as guest)\n\n"; system("perldoc $0"); exit(0); } my %mockup_table_results=(Qacc => "TC96141", Sacc => "chr1_ctg1"); #my %mockup_table_results=(Qacc => "AL820063", Sacc => "AL606692"); print STDOUT "\n Will be updating the table $dbname.$tab_name \n\n"; #### need to connect to database and retrieve HSPs info #### for each query::subject pair my $dsn = "DBI:mysql:database=$dbname;host=$host"; my $dbh = DBI->connect($dsn,$dbuser,$dbpass,{RaiseError=>1}) or die "Could not connect, error was:\n\t$DBI::errstr\n\n"; # This Retrieves BLAST list of Query::Subject pairs info from db my $QuerySubj_Pairs_array_ref = &QueryDB; # OR use mockup for test... # The array ref is equal to and anonymous array '[]' that contains in its first element the ref to a hash #my $QuerySubj_Pairs_array_ref = [\%mockup_table_results]; my $HSPs_array_ref= undef; # Prepare the statement to retrieve the HSPs for current Query::Subj pair my $getTheseHSPs = qq[ SELECT Qacc, Sacc, Qstart, Qend, Sstart, Send, E_val, bit_score, Repeated_HSP, HSP_id FROM $dbname.$tab_name WHERE Qacc = ? AND Sacc = ? ORDER BY Qstart, Qend #limit 5 ]; my $sth_getHSPs = $dbh->prepare($getTheseHSPs); # Prepare the query statement to update database, before getting into loop my $updateSQLstr = qq[ UPDATE $dbname.$tab_name SET Repeated_HSP=1 WHERE HSP_id=? ]; my $sth_updateBLASTtable = $dbh->prepare($updateSQLstr); # print "The contents of the retrieved table are: \n"; my $records =1; my $previous_HSPref=undef; my $flag_as_bad = undef; # BIG LOOP foreach my $QuerySubjPair (@{$QuerySubj_Pairs_array_ref}) { print STDERR "\n================\n== New Query :: Subject pair to compare.\n================\n"; # Obtain the HSPs eval{ $sth_getHSPs->execute($QuerySubjPair->{Qacc} , $QuerySubjPair->{Sacc}); }; die "\n\tBIG Problem!, can't execute query. error is:\n $@ \n" if ($@); $HSPs_array_ref = $sth_getHSPs->fetchall_arrayref(\%table2_columns); $best_HSP_ref = $previous_HSPref = ${$HSPs_array_ref}[0]; # store the first one for now print STDERR "\tQacc \tSacc\tQstart\tQend\tbit_score\tHSP_id\tRepeated_HSP\n"; # while in the same QUERY::SUBJ pairs foreach my $HSProw (@$HSPs_array_ref) { # for each HSP record in QuerySubj Pairs table print STDERR "\t$HSProw->{Qacc}\t$HSProw->{Sacc}\t$HSProw->{Qstart}\t$HSProw->{Qend}\t$HSProw->{bit_score}\t$HSProw->{HSP_id}\n"; #if object is returned as array of hashes # Test for the adjacency of this HSP to previous (records are expected to be sorted by Qstart, Implement this in the SQL above ) # If overlapped, Is this one better than previous? if (&OverlapsPrevious($best_HSP_ref,$HSProw)) { # table required to be sorted left to right for this to work #print STDERR "Overlap found with best HSP so far based on score... \n"; # Compare statisitics for this HSP with best so far, # set flag to 1 if HSP overlpas another one that has better bitscore ($best_HSP_ref,$flag_as_bad) = &StoreBestHSP($best_HSP_ref,$HSProw); &UpdateTable($flag_as_bad); $previous_HSPref = $HSProw; } else { # IF not overlap to previous(the HSP to the left of this one), set this one as best score so far $previous_HSPref = $best_HSP_ref = $HSProw; # print STDERR " New BEST non-overlapping HSP : $best_HSP_ref->{HSP_id} . \n "; } $records++; } } # END BIG LOOP # Finish statements $sth_updateBLASTtable->finish; $sth_getHSPs ->finish; ### DISCONNECT FROM DB $dbh->disconnect or warn "Disconnection from DB failed at end of script: $DBI::errstr\n"; $records=$records-1; print "\nThe number of HSPs processed was $records. bye\n"; ###################### END OF MAIN ################ ##### ##### sub QueryDB { # Design Query my $statement=qq[ SELECT Qacc, Sacc FROM $dbname.$tab_name GROUP by Qacc, Sacc ORDER by HSP_id #LIMIT 5 ]; # print STDERR "First SQL Statement is:\n$statement\n"; $sth = $dbh->prepare($statement); # Execute query. eval{ $sth->execute(); }; die "\n\tBIG Problem!, can't execute query. error is:\n $@ \n" if ($@); my $array_ref= $sth->fetchall_arrayref(\%table_columns); # only the selected colums, as array of hashes # if left empty, gets all columns # my $array_ref= $sth->fetchall_arrayref() # OR select colums by number, returning ref to array of arrays ... #my $array_ref= $sth->fetchall_arrayref( [0,1,4,2,3,9] ); my $table1size= @$array_ref; print STDERR "The number of query|subject pairs in result set is $table1size\n\n"; $sth->finish; return $array_ref; } #### ###### end sub #### ##### ##### sub OverlapsPrevious { # Based on MSPcrunch2 algorithms my ($PreviousHSP,$ThisHSP) = @_; if ($ThisHSP->{HSP_id} == $PreviousHSP->{HSP_id}) { #print STDERR "Warning, If you want met to compare the overlap of one HSP with itself...I say there is no overlap. \n"; return 0; } ####_____________________________________ # SS1 SE1 SS2 SE2 my $ThisQstart = $ThisHSP->{Qstart}; # --=====------=======--- my $PreviousQend = $PreviousHSP->{Qend}; # ||||| /////// QueryGap = QS2-QE1-1 my $ThisSstart = $ThisHSP->{Sstart}; # --=====----=======--- my $PreviousSend = $PreviousHSP->{Send}; # QS1 QE1 QS2 QE2 # correction to returned subject coordinates: # (if in other strand orientation (-), coordinates are backwards) # To fix, use the absolute value of difference my $QueryGap = ( $ThisQstart - $PreviousQend -1); my $SbjGap = (abs($ThisSstart - $PreviousSend) -1); # return the greater of two general scalars : ($a,$b)[$a<$b] # We want the minimum, so invert the sign my $HSP_dist = (($QueryGap,$SbjGap)[$QueryGap>$SbjGap]); #print STDERR "HSPdist is $HSP_dist , QueryGap is $QueryGap and SubjGap is $SbjGap \n"; # IF $HSPdist is negative (but we accept up to 20 bases overlap), the two HSPs are overlapping, # This obviously means than one of the two (Query or SUbject of the two HSPs) has a repeated region # for which we are detecting multiple hits. if ( ($HSP_dist < -20) ) { # Return TRUE print STDERR "YES, is in overlap (HSPdist = $HSP_dist)... "; return 1; } else { # Return FALSE print STDERR "NOT in overlap\n"; return 0; } } ##### ####### end sub ##### ##### ##### sub StoreBestHSP { # called like: ($best_HSP_ref,$flag_as_bad)= &StoreBestHSP($best_HSP_ref,$HSProw); # returns the High score and the low score HSPs # print "\n$HSProw->{Qacc}\t$HSProw->{Sacc}\t$HSProw->{bit_score}\t$HSProw->{HSP_id}\n"; #if object is returned as array of hashes my $bestHSPref = shift; my $currentHSPref = shift; #my $bestHSP_QSpair = $bestHSPref->{Qacc}."_".$bestHSPref->{Sacc}; #my $currentHSP_QSpair = $currentHSPref->{Qacc}."_".$currentHSPref->{Sacc}; if($currentHSPref->{bit_score} > $bestHSPref->{bit_score}) { #print STDERR "NEW BEST HSP, this HSP overlaps previous best, but has better score. \n"; return ($currentHSPref,$bestHSPref); } else { #print STDERR "this HSP not as good as previous one.\n"; return ($bestHSPref,$currentHSPref); # else, the older is still the best } } ###### end sub #### ##### ##### sub UpdateTable { my $targetHSP = shift; my $HSPnum = $targetHSP->{HSP_id}; # called as UpdateTable$flag_as_bad); print "Updating table for HSP $HSPnum.\n"; eval{ $sth_updateBLASTtable->execute($HSPnum); # print STDOUT "Please uncomment lines in UpdateTable function if you want to save to DB\n"; }; die "\n\tBIG Problem!, can't execute query. error is:\n $@ \n" if ($@); return; } #### ####### END sub ####