#!/usr/bin/perl -w

#This script is for processing large-scale EST data
#This script is used to remove sequences with phred score less than 20
#at the beginning and end of each read in both fasta file and its corresponding
#quality score file
#The script reads raw fasta file
#You should run this script right after you finished phred, 
#then do cross_match to find vector sequence
#At command line type phredclean.pl your_raw_fasta_filename.fasta
#Author and contact info: David Hummel at hummel@pw.usda.gov


($prog) = ($0 =~ m#([\w\.]+)$#);
$file_suff = '_1'; # appended before extensions
$phred_qual_ext = '.qual';
$cutoff = 0;

# parse args
unless (@ARGV) {&usage;}
#shift @ARGV if ( ($cutoff) = ($ARGV[0] =~ /^-(\d+)$/) );
if ( $ARGV[0] =~ /^-(\d+)$/ ) {$cutoff = $1; shift @ARGV;}

# iterate files
while ($file = shift @ARGV) {

    # check files
    unless (-e $file) {print "skipping $file: doesn't exist\n"; next;}
    my $qualfile = $file; $qualfile .= $phred_qual_ext;
    unless (-e $qualfile) {print "skipping $file: $qualfile doesn't exist\n"; next;}

    my %seqs; my %quals;

    # grab fasta sequences
    open (FAS, $file) || die "can't open $file: $!";
    while (<FAS>) {
	chomp;
	my $name = ''; # (re)set name
	my $nameline = ''; # (re)set nameline
	my @nameline = (); # (re)set nameline
	my $sequence = ''; # (re)set sequence
	my @sequence = (); # (re)set sequence
	# see if we're at > line
	if ( ($nameline,$name) = /^(\s*>(\S+).*)$/ ) {
	    do {
		chomp ($_ = <FAS>);
		$sequence .= $_ unless /^\s*>|^\s*$/;
	    } until ( /^\s*>/ || eof );
	    $sequence =~ tr/a-z/A-Z/; # make uppercase
	    $sequence =~ s/\s//g; # remove any spaces
	    @sequence = split (//,$sequence);
	    @nameline = split (/\s+/,$nameline);
	    $seqs{$name} = [\@nameline,\@sequence];
	    redo if /^\s*>/;
	}
    }
    close (FAS);

    # grab quality values
    open (QUAL, $qualfile) || die "can't open $qualfile: $!";
    while (<QUAL>) {
	chomp;
	my $name = ''; # (re)set name
	my $nameline = ''; # (re)set nameline
	my @nameline = (); # (re)set nameline
	my @quality = (); # (re)set quality
	# see if we're at > line
	if ( ($nameline,$name) = /^(\s*>(\S+).*)$/ ) {
	    do {
		chomp ($_ = <QUAL>);
		my @qual = split unless /^\s*>|^\s*$/;
		push (@quality, @qual) if (@qual);
	    } until ( /^\s*>/ || eof );
	    @nameline = split (/\s+/,$nameline);
	    $quals{$name} = [\@nameline,\@quality];
	    redo if /^\s*>/;
	}
    }
    close (QUAL);

    # iterate through seqs
    my $fas_out = $file; $fas_out =~ s/\./$file_suff\./;
    my $qual_out = $fas_out . $phred_qual_ext;
    open (FASOUT, ">$fas_out") || die "can't open $fas_out: $!";
    open (QUALOUT, ">$qual_out") || die "can't open $qual_out: $!";
    for (sort keys %seqs) {

	# remove low qual seqs from beginning
	while (@{${$quals{$_}}[1]} && $quals{$_}->[1]->[0] eq 0) {
	    shift @{${$quals{$_}}[1]};
	    shift @{${$seqs{$_}}[1]};
        }

	# remove low qual seqs from end
	while (@{${$quals{$_}}[1]} && $quals{$_}->[1]->[-1] eq 0) {
	    pop @{${$quals{$_}}[1]};
	    pop @{${$seqs{$_}}[1]};
        }

        #print "$_: ", scalar @{${$quals{$_}}[1]}, ':', scalar @{${$seqs{$_}}[1]}, "\n";

        # reset nameline values
        $seqs{$_}->[0]->[1] = scalar @{${$seqs{$_}}[1]};
        $seqs{$_}->[0]->[2] = 0;
        $seqs{$_}->[0]->[3] = scalar @{${$seqs{$_}}[1]};
        $quals{$_}->[0]->[1] = scalar @{${$quals{$_}}[1]};
        $quals{$_}->[0]->[2] = 0;
        $quals{$_}->[0]->[3] = scalar @{${$quals{$_}}[1]};

        # print sequence/quality
        if ( @{${$seqs{$_}}[1]} && scalar @{${$seqs{$_}}[1]} >= $cutoff ) {
            &printseq( \*FASOUT, $seqs{$_}->[0], join ('',@{${$seqs{$_}}[1]}) );
            &printqual( \*QUALOUT, $quals{$_}->[0], $quals{$_}->[1] );
        }

    }
    close (FASOUT);
    close (QUALOUT);

}

#################

sub printseq {
    my ($fh,$nameref,$sequence) = @_;
    my ($name,$readlen,$prelowqual,$goodqual,$mach) = @$nameref;
    printf $fh ("%s   %4d   %4d   %4d  %s\n",$name,$readlen,$prelowqual,$goodqual,$mach);
    while ($sequence) {
	my $sub = substr ($sequence, 0, 50, "");
	print $fh "$sub\n";
    }
}

sub printqual {
    my ($fh,$nameref,$qualref) = @_;
    my ($name,$readlen,$prelowqual,$goodqual,$mach) = @$nameref;
    my @quality = @$qualref;
    my $pos = 0;
    printf $fh ("%s   %4d   %4d   %4d  %s\n",$name,$readlen,$prelowqual,$goodqual,$mach);
    while (@quality) {
	my $qual = shift @quality;
	$pos++;
	print $fh $qual;
	if ($pos == 17) {print $fh "\n"; $pos = 0;}
	else {print $fh ' '}
    }
    print $fh "\n" unless $pos == 0;
}

sub usage {
print <<USAGE;

remove low quality bases from phred output files
generate new fasta file and quality file

usage: $prog [-cutoff] file.fasta [...]

       cutoff = base count of cleaned sequences below which
                sequences/qualities will not be printed
       file.fasta = phred fasta output file

file.fasta$phred_qual_ext must also exist in the current directory

output files: file$file_suff.fasta and file$file_suff.fasta$phred_qual_ext

USAGE
exit;
}

