#!/usr/bin/perl
# fastaFileCheck.pl
# 
# Checks a Fasta file, namely when there are additional lines after the > line that contain text instead of sequence data.
# usage: fastaFileCheck.pl filetocheck

=head1 Name

fastaFileCheck.pl - a utility for checking the integrity of fasta formatted files

=head1 Synopsys

perl fastaFileCheck.pl [-simvhD] dnafastafile

perl fastaFileCheck.pl -p[simvhD] proteinfastafile

Checks the structure of a DNA fasta file (or protein fasta files using the -p option) and prints out various information. For a fast check of a file, use the -s option, which prints a summary report only.

=head1 Description

The program checks

=over 4

=item (1)

if letters other than A,T,G,C and N occur in lines that do not start with > (or letters corresponding to the one letter amino acid code +X +* when the protein option (-p) is used). 
Fewer than 80% A,T,G,C and N (or non-amino acid letters) are considered a severe problem 
(such errors will break most parsers etc.).

=item (2)

that there are no empty lines in the files. 

=item (3)

whether duplicate identifiers are present in the file.

=item (4)

whether identifiers exceed a length of 50 characters.

=item (5)

whether there are lines that are over 100000 characters long (the total sequence length of an entry
can be infinitely long - that is not checked. Line length here means a stream of characters until a 
carriage return is encountered. The original fasta definition calls for a line length of 60 characters, but this is not checked as most fasta files don't adhere to this.

=back

Lines with more than 20% non-DNA (or non-protein) characters are counted as severe errors, minor error with less.

=head2 Options

=over 4

=item -p

Use for files containing protein sequences.

=item -s

Print summary only.

=item -i

Print identifiers only, and summary.

=item -m

Ignore minor problems.

=item -v

Print version of the program.

=item -h

Print help message

=item -D

Display sequence length distribution

=back

=head1 Version

Version 2.3 April 2, 2004

=head2 Version History

 2.0 2002/02/08 Added command line options
 2.1 2002/02/16 Added protein option
                duplicate id checking
                line length checking
                help information
 2.2 2002/09/17 Fixed a bug that reported too many empty lines
 2.3 2004/04/02 Added option D to display sequence length distribution
                Removed option S. Min and max lengths are now always displayed.

=head1 Author

 Lukas Mueller

 The Arabidopsis Information Resource. Copyright (c) 2001, 2002. 
 The Carnegie Institution of Washington, Department of Plant Biology. Copyright (c) 2001, 2002.
 Cornell University. Copyright (c) 2004.
 The Solanaceae Genomics Network. Copyright (c) 2004.
 All Rights Reversed.

Please report bugs to lam87@cornell.edu.


=cut
    

    
use strict;
use Getopt::Std;
use vars qw($opt_s $opt_m $opt_i $opt_v $opt_h $opt_p $opt_D $opt_L);

# opt_p : protein file.
# opt_s : summary only. Don't generate messages during run.
# opt_m : do not report minor problems
# opt_i : identifiers only. Print affected identifiers.
# opt_v : print version info
# opt_h : print help
# opt_D : diplay sequence length distribution

# constants
#
my $maxnamelength = 50;
my $maxlinelength = 100000; # this is not the length of an entry, just the length of a line 
                            # (of which an entry can have unlimited)
# variable declarations
#
my $lineno = 0;
my $dirtylines = 0;
my $length = 0;  # so first entry doesn't appear to be empty
my $linetoolongcount = 0;
my $sequences_not_checked=0;
my $entryname = "";
my $oldentryname = "";
my $nametoolongcount = 0;
my $zerolengthcount = 0;    # looks at length of a sequence entry
my $zerolinelengthcount =0; # looks at the length of a individual line
my $severeproblemcount = 0;
my $entryProblemCount;
my %entries;
my $severity;
my $duplicatedIds = 0;
my $what;
my $len;
my $good;
my @seqstats;
my $totalentries=0;
my $totallength=0;
my $longest_length=0;
my $shortest_length=0;
my $longest_seq="";
my $shortest_seq="";
my $length_cutoff=0; # used for option L with the length cutoff 
my $longer_seq_count=0; #sequences longer than length_cutoff
my $shorter_seq_count=0; #sequences shorter than length_cutoff

getopts('psmivhDL:');

if ($opt_v) { print_version(); exit(); }

if ($opt_h) { print_help(); exit(); }
 

my $filename = shift;

open (F, "<$filename") || die "Can't open $filename. Execution terminated. Please try again.\n";

print STDERR "Checking file... \n";
							      
my $line;
							      
while (<F>) {
   chomp;
   $line = $_;
   check_line($line);
   
}

process_entry($line);

close (F);

# quit with option L without giving the summary, as this would mess up the tab delimited output 
# when it is used in sorting etc.
if ($opt_L) { 
    print STDERR "# >= $opt_L: $longer_seq_count\n";
    print STDERR "# <  $opt_L: $shorter_seq_count\n";
    exit(0); 
}

print "\nSummary\n*******\n\n";
print "This file contained the following errors:\n";
if ($opt_m) {
    print "- [Minor errors ignored (option -m)]\n";
}
else { 
    print "- $dirtylines lines with minor sequence problems\n";
}
print "- $severeproblemcount lines with major sequence problems\n";
print "- $nametoolongcount entries where the seq name is too long\n";
print "- $zerolengthcount entries that lack sequence info.\n";
print "- $linetoolongcount lines that are over $maxlinelength long.\n";
print "- $zerolinelengthcount lines that are emtpy [may be trailing]\n";
print "- $duplicatedIds duplicated Ids.\n";
print "- $sequences_not_checked sequences were not checked [too long]\n\n";
print "- Total length: $totallength\n";
print "- Sequence entries: $totalentries\n";
print "- Average Sequence length: "; printf "%10.2f", ($totallength/$totalentries);
print "\n";
print "- Longest Sequence: $longest_length\n";
print "- Shortest Sequence: $shortest_length\n";
print "\n";

my $exit_code = 0;

if ($severeproblemcount || $zerolengthcount) { print "This file should be checked and repaired before further use. \n\n"; $exit_code=-1;  }
elsif ((!$severeproblemcount) && !($nametoolongcount) && (!$zerolengthcount)) { print "This file should be usable.\n\n"; }

if ($opt_D) {
  print "Sequence Size Distribution:\n\n";
  my $i=0;
  my @compressed =();
  my %label = ();
  my $divs = 20;
  my $increment = @seqstats/$divs;

  for (my $i=0; $i<@seqstats; $i++) {
      $compressed[$i/$increment]+=$seqstats[$i];
  }
  my $max = 0;
  foreach my $i (@compressed) {
      if ($max < $i) { $max= $i; }
  }
  my $maxwidth = 60;
  my $factor=$maxwidth/$max;

#  print "max=$max. factor=$factor\n";
  for (my $i=0; $i<@compressed; $i++) {
      my $size=$i*$increment;
      printf "%6d",  ($size); print " ["; printf "%6d", "$compressed[$i]"; print "]";
      print " ";
      for (my $n=0; $n<($compressed[$i] * $factor); $n++) {
	  print "*";
      }
      print "\n";
  }
}
 

 
      
print "\nDone.\n\n";

exit($exit_code);

sub checkseq 
{     
    $what = $_[0];
    $len = length($what);
    if ($len > 32000) { return 3; }
    if ($what =~ /[ATGCatgcNn]{$len}/) 
    {
	return 0;
    }
    else
    {
	$_ =  $what;
	    $good = tr/[ATGCatgcNn]/x/;
	if ($good/$len > 0.8) { return 1;  }
	else { return 2; }
    }
}


sub checkproteinseq {
    $what = $_[0];
    $len = length($what);
    if ($len > 32000) { return 3; }
    if ($what =~ /[ACDEFGHIKLMNPQRSTVWXYacdefghiklmnpqrstvwxy\*]{$len}/)
    {
	return 0;
    }
    else
    {
	$_ =  $what;
	$good = tr/[ACDEFGHIKLMNPQRSTVWXYacdefghiklmnpqrstvwxy\*]/x/;
	if ($good/$len > 0.8) { return 1;  }
	else { return 2; }
    }
}

sub message {
    my $message = shift;
    # if 'summary only' is not on, we print everything
    if (!$opt_s) {
	# check if 'identifiers only' (opt_i) option is set
	if ($opt_i) {
	    # print the identifier only once if it contains several problems
	    if ($entryProblemCount <= 1)  { 
		print "$entryname ($severity)\n";
	    }
	}
	else { print $message; }
    }   
}
    
sub print_version {
    print "\nfastaFileCheck Version 2.3 April 2, 2004.\n";
    print "Copyright (c) 2001-2004 by Lukas Mueller,\n";
    print "The Arabidopsis Information Resource - TAIR, and \n";
    print "The Carnegie Institution of Washington, Department of Plant Biology.\n\n";
}

sub print_help {
    print "\nfastaFileCheck\n\nHelp\n====\n\nOptions:\n";
    print "-p : protein file.\n";
    print "-s : summary only. Don't generate messages during run.\n";
    print "-m : do not report minor problems.\n";
    print "-i : identifiers only. Print sequence identifiers of error containing entries.\n";
    print "-v : print version info.\n";
    print "-h : print help (these lines, actually).\n";
    print "-D : display sequence length distribution.\n";
    print "-L : print out a tab delimited file with the lengths of the sequences and their ids (no other output).\n\n";


}

sub hashValueAscendingNum {
   $a <=> $b;
}

sub check_line {
    $lineno++;
    if (length($line) == 0) {
	message("Problem at line $lineno: Line has zero length!\n");
       $zerolinelengthcount++;
       $severeproblemcount++;
   }
   if (($line =~ /^>/)) { 


       process_entry($line);
     
 }
   else { 
       $length += length($line);   
       if ($opt_p) { $severity = checkproteinseq($line); }
       else {
	   $severity = checkseq($line);
       }
       if ($severity >0)  { 
	   if (($severity ==1)) {
	       # do we show minor problems? check m option
	       if (!$opt_m) {
		   # if minor problem and option 'ignore minor problem' not set, print it 
		   message("Problem at line \# $lineno :\n$line\n\n");
		   $dirtylines++; 
		   $entryProblemCount++;
	       }
	   }
	   elsif ($severity ==2) { 
	       message("Severe problem at line \#$lineno: \n$line\n\n");
	       $severeproblemcount ++;
	       $entryProblemCount++;
	   }
	   elsif ($severity == 3) {
	       message("Sequence entry too long to be checked: $lineno $entryname\n");
	       $sequences_not_checked++;
	   }
	   else { 
	       message ("Unknown problem at line \# $lineno. Severity: $severity.\n\n"); 
	   }
       }
       else {
	   
       } 
       $severity = 0;
   }       
}

sub process_entry {
    my $line = shift;
    $totalentries++;
     $totallength+=$length;
     if ($length>$longest_length) { $longest_length = $length; }
     if (($length<$shortest_length) || ($shortest_length==0)) { $shortest_length = $length; }
     if (($length ==0) && ($lineno!=1)) { 
	 $entryProblemCount++;
	 message ("Problem at entry $entryname (line# $lineno) No sequence found!!! \n\n"); 
	 $zerolengthcount++; 
	 $severeproblemcount++;
     }
     if (length($line)>$maxlinelength) { 
	 message("Problem at line $lineno: Line too long (".length($line).">$maxlinelength)\n"); 
	 $linetoolongcount++;
     }
     if ($opt_D) {
	 $seqstats[$length]++;
     }
     if ($opt_L) {
	 print "$entryname\t$length\n";
	 if ($length >= $opt_L) { $longer_seq_count++; }
	 if ($length < $opt_L) { $shorter_seq_count++; }
     }
     
     $length = 0;
     $entryProblemCount = 0;
     $oldentryname = $entryname;
     $entryname = $line;
     $entryname =~ s/\>(.*?)\s(.*)/$1/;
     # check if entries are duplicated.
     if (exists ($entries{$entryname})) { 
	 message("$entryname is duplicated\n");
	 $duplicatedIds++;
     }
     else {
	 $entries{$entryname}++; 
     }
     
     if (length($entryname) > $maxnamelength) { 
	 message("Problem at entry $entryname (line# $lineno): Name too long!\n"); $nametoolongcount++; 
     }
}




