#!/usr/local/bin/perl -s

#--------------------------------------------------------
#  File Name: sweep2.pl                                  
#                                                       
#  Description: This Perl script takes a *_1.fasta (or *_1.qual) file
#		and the matching <file>.out file, cleans out all of the
#		masked vector sequence data, and then writes the clean
#		data to a new *_2.<ext> file.
#  Created: 02/23/2000 jt
#  Modified: 03/07/2000 jt
#
#  This script is an older version of cmclean.pl
#--------------------------------------------------------

# Trace-type tags.  Each sequence record is later split on this pattern to
# separate its header fields from its residues.
$extension = 'ABI|SCF';

# Pattern that selects the lines of interest in the .out file.  Earlier
# variants are kept (commented out) for reference.
#$Textension = '\.(a|b|c|d|e|f|g|h)(0|1)\d';
$Textension = '\.scf|\.ab1|\.abi|\-F|\-R|\.s1|\.r1|\.s2|\.x1|\.ABD';
#$Textension = '\990802|\991203|\991208|\991213|\991217|\991220|\991221|\991222|\991223|\.ABD';

# The input file is the first command-line argument.  Without it the script
# used to carry on with ".out" and "_2." as file names, so stop early instead.
$file = $ARGV[0];
die "Usage: $0 <name_1.fasta>\n" unless defined($file) && length($file);

# The match report is expected next to the input as "<file>.out".
$outfile = $file . '.out';

# Output name: the text before the first '.', cut at its first '_', then
# "_2." and the text between the first and second '.'.
# Example: "reads_1.fasta" -> "reads_2.fasta".
# NOTE(review): a path containing another '.' or '_' (e.g. "./x_1.fasta")
# does not survive this split -- confirm callers always pass a bare file name.
@extn = split(/\./, $file);
@fname = split(/\_/, $extn[0]);
$fnew = $fname[0] . "_2." . $extn[1];

# Zero the shared counters before the .out file is parsed.
$j = $k = $z = 0;

#-----Getting information from the .out file------------
# Reads $outfile and fills the lookup tables used when the sequences are
# cleaned:
#   @Tname / @Tstart / @Tend   from lines that match $Textension and also
#                              contain "Residues:" (fields 0, 5 and 7)
#   @infoname / @infostart / @infoend / @infoleft
#                              from every other line matching $Textension
#                              (fields 5, 6, 7 and the number inside the
#                              parentheses of field 8)
# $Tnum and $numinfo hold the last index of the respective tables (-1 when
# a table is empty).
# NOTE(review): the field positions of the second table presumably follow
# the cross_match report layout, where the line starts with whitespace and
# field 0 is therefore empty -- confirm against a real .out file.

# A missing or unreadable .out file used to go unnoticed: both tables stayed
# empty and every read was later written out as if nothing were masked.
open(my $out_fh, '<', $outfile)
   or die "Cannot open $outfile for reading: $!\n";
my @out_lines = <$out_fh>;
close($out_fh);

# Keep only the lines that mention a read name and sort them into the two
# kinds of table rows.
my (@t_rows, @match_rows);
foreach my $line (@out_lines) {
   next unless $line =~ /$Textension/;
   if ($line =~ /Residues\:/) {
      push @t_rows, $line;
   } else {
      push @match_rows, $line;
   }
}

foreach my $row (@t_rows) {
   my @field = split(/\s+/, $row);

   push @Tname,  $field[0];
   push @Tstart, $field[5];
   push @Tend,   $field[7];
}
$Tnum = $#Tname;

foreach my $row (@match_rows) {
   my @field = split(/\s+/, $row);

   # Field 8 looks like "(123)": drop the first and last character.
   my $wrapped = defined($field[8]) ? $field[8] : '';
   my $left    = length($wrapped) > 2 ? substr($wrapped, 1, -1) : '';

   push @infoname,  $field[5];
   push @infostart, $field[6];
   push @infoend,   $field[7];
   push @infoleft,  $left;
}

$numinfo = $#infoname;

#--------------------------------------------------------

#-----Cleaning the vector from the fasta file------------
# Slurp the input file, glue its lines together with single spaces and cut
# the result at every '>' so that each element of @seqbreaks is one record
# ("name fields... TYPE residues...").  Then open the output file on the
# CLEAN handle for the cleaning loop that follows.

# Three-argument open: the file name is never interpreted as a mode or a
# pipe, and a failure stops the script instead of producing an empty result.
open(my $in_fh, '<', $file)
   or die "Cannot open $file for reading: $!\n";
@lines = <$in_fh>;
close($in_fh);

chomp @lines;
$allseq = join(' ', @lines);
@seqbreaks = split(/\>/, $allseq);
shift(@seqbreaks);                      # text before the first '>' is not a record

open(CLEAN, '>', $fnew)
   or die "Cannot open $fnew for writing: $!\n";

# Write one record to CLEAN: the ready-made header line, then the residues
# at 0-based positions $from .. $to-1 of $residues, 50 per line.
# A final newline is always written, even when no residue is.
sub write_clean_record {
   my ($header, $residues, $from, $to) = @_;

   print CLEAN $header;

   my @chars = split(//, $residues);
   my $col = 1;
   for (my $pos = $from; $pos < $to; ++$pos) {
      if ($col > 50) { print CLEAN "\n"; $col = 1; }
      print CLEAN "$chars[$pos]";
      $col++;
   }
   print CLEAN "\n";
}

# Walk over every sequence record, look its name up in the two tables built
# from the .out file and write the part of the read that is kept.
# In the case comments below: A = match start, B = match end, C = residues
# left after the match, N = read length (header field 1); "x" marks masked
# residues.
foreach my $record (@seqbreaks) {

   # Trace type: SCF if the record mentions it anywhere, ABI otherwise.
   my $type = ($record =~ /SCF/) ? 'SCF' : 'ABI';

   # Header fields stand before the type tag, residues after it.
   my @seqadjust = split(/$extension/, $record);
   my @seqinfo   = split(/\s+/, $seqadjust[0]);

   $seqadjust[1] =~ s/ //g;
   $seqadjust[1] =~ tr/a-z/A-Z/;

   my $name   = $seqinfo[0];
   my $length = $seqinfo[1];
   my $total  = $seqinfo[3];
   my $start  = '0';
   my $num    = $seqinfo[3];

   # (1)  AAAAAAAAAA "0 0 0": nothing is written for such a read.
   next if ($seqinfo[1] == 0) && ($seqinfo[2] == 0) && ($seqinfo[3] == 0);

   # Look the name up in the match table.  The last matching row wins and
   # $match counts how many rows carry this name.
   my $index;
   my $match = 0;
   for my $i (0 .. $numinfo) {
      if ($name eq $infoname[$i]) {
         $index = $i;
         $match++;
      }
   }

   # Same lookup in the T table (last matching row wins).
   my $Tindex;
   for my $i (0 .. $Tnum) {
      $Tindex = $i if $name eq $Tname[$i];
   }
   my $has_T = defined $Tindex;

   # (2) ATCGATCG -- not in the match table: write the window that starts at
   # header field 2 and is header field 3 residues long.
   if (!defined $index) {
      write_clean_record(
         ">$name    $total      $start      $num  $type\n",
         $seqadjust[1], $seqinfo[2], $seqinfo[2] + $seqinfo[3]);
      next;
   }

   my $match_start = $infostart[$index];        # A
   my $match_end   = $infoend[$index];          # B
   my $match_left  = $infoleft[$index];         # C
   my ($begin, $end);                           # 0-based slice [begin, end)

   # (3) xxxxxxx  A=1 B=N C=0: the whole read is masked, write nothing.
   if (($match_start == 1) && ($match_end == $length) && ($match_left == 0)) {
   }
   # (4) & (5) xxxxATCGATCGxxxx  A=1 B<N C<N: keep what follows the match.
   elsif (($match_start == 1) && ($match_end < $length) && ($match_left < $length)) {
      if ($has_T) {                             # there are T
         $total = $num = $length - $Tend[$Tindex];
         $begin = $Tend[$Tindex];
         $end   = $length;
      } else {
         $total = $num = $match_left;
         $begin = $match_end;
         $end   = $match_end + $match_left;
      }
      # This header has five blanks after the name (the others have four);
      # kept as is so the output stays byte-compatible.
      write_clean_record(
         ">$name     $total      $start      $num  $type\n",
         $seqadjust[1], $begin, $end);
   }
   # (6)  ATCGxxxx  A>1 B=N C=0: keep what precedes the match.
   elsif (($match_start > 1) && ($match_end == $length) && ($match_left == 0)) {
      if ($has_T) {                             # there are T
         $total = $num = $Tstart[$Tindex] - 1;
         $begin = 0;
         $end   = $Tstart[$Tindex] - 1;
      } else {                                  # there is no T
         $total = $num = $match_start - 1;
         $begin = 0;
         $end   = $match_start - 1;
      }
      if ($match > 1) {
         # More than one row for this read: keep the stretch between the
         # end of the previous table row and the start of this one.
         # NOTE(review): assumes the previous row belongs to the same
         # read -- confirm.
         $begin = $infoend[$index - 1];
         $end   = $match_start - 1;
         $total = $num = $match_start - $infoend[$index - 1] - 1;
      }
      write_clean_record(
         ">$name    $total      $start      $num  $type\n",
         $seqadjust[1], $begin, $end);
   }
   # (7)  ATCGxxxxATCG  A>1 B<N C<N: match in the middle, keep one flank.
   elsif (($match_start > 1) && ($match_end < $length) && ($match_left < $length)) {
      # NOTE(review): in both 7a and 7b the "T" branch leaves $num at header
      # field 3 while every other branch sets it together with $total, and
      # some header counts are one more than the residues written (7a, and
      # 7b with T).  Left unchanged because the intended values cannot be
      # established from this file -- confirm.
      if ($match_start > $match_left) {         # (7a) ATCGxxxxAT
         if ($has_T) {                          # there are T
            $total = $Tstart[$Tindex];
            $begin = 0;
            $end   = $Tstart[$Tindex] - 1;
         } else {                               # there is no T
            $total = $num = $match_start;
            $begin = 0;
            $end   = $match_start - 1;
         }
      } else {                                  # (7b) ATxxxxATCG
         # A tie (A == C) used to match neither 7a nor 7b, so the record was
         # written with $begin/$end left over from an earlier record.  With
         # A == C the right flank (C residues) is one longer than the left
         # flank (A - 1 residues), so ties are handled here.
         if ($has_T) {                          # there are T
            $total = $length - $Tend[$Tindex];
            $begin = $Tend[$Tindex] + 1;
            $end   = $length;
         } else {                               # there is no T
            $total = $num = $length - $match_end;
            $begin = $match_end;
            $end   = $length;
         }
      }
      if ($total != 0) {
         write_clean_record(
            ">$name    $total      $start      $num  $type\n",
            $seqadjust[1], $begin, $end);
      }
   }
   # Any other combination of A, B, C and N writes nothing.
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close(CLEAN) or die "Error while closing $fnew: $!\n";
#---------------------------------------------------------------


