
=head1 NAME

automatic annotation.pl - a script that produces automatic annotations from the pubsearch database.

=head1 DESCRIPTION

the script takes a gene name as a parameter. 

The program calculates the following values:

(1) Average specificity of gene name occurrence in the articles.

For each article in which the gene name occurs, the number of occurrences of any other gene names are counted. The ratio of gene name occurences over number of other gene names that occur is the specificity of the article. An average with standard deviation is computed from the data.

(2) a list of co-occuring annotation terms is generated. Each annotation term is divided by the number of articles it has been found in.

Term and gene matches in articles should be distributed according to a poisson distribution. An evalue can be calculated for the observed matches.

=cut

use strict;
use DBI;

#my $genename = shift;

my $dbh = DBI->connect("dbi:mysql:host=localhost;database=pubsearchdb","lukas","arabidopsis", { RaiseError => 1 } );

#my $pub_gene = Pubgene->new($dbh, $genename);

my $pub_gene_factory = Pubgene_factory->new($dbh);

foreach my $pub_gene ($pub_gene_factory->get_all_genes()) { 
    print "\nGENE: ".$pub_gene->get_name()."\n\n";
    my @article_ids =  $pub_gene->get_article_ids();
    my $specificity_sum = 0;
    foreach my $a (@article_ids) { 
	my $s =  $pub_gene->get_specificity($a);
	print "Specificity for article $a: $s\n";
	$specificity_sum += $s;
    }
    my $average_specificty = "ND";
    if (@article_ids) { $average_specificty=$specificity_sum/@article_ids; }
    print "Average specificity: $average_specificty\n";
    print "\nTerms:\n\n";
    
    my $h = $pub_gene->get_terms();

    foreach my $k (keys %$h) { 
	my $total_term_ratio = $pub_gene->get_total_term_ratio($k);
	my $score = ($$h{$k}/@article_ids)*$average_specificty/$total_term_ratio;
	if ($score>5) {
	    print $pub_gene->get_name()." TERM: $k FREQUENCY: $$h{$k}  SCORE: $score\n";
	}
    }
}
print "\nDone.\n";

package Pubgene_factory;

sub new { 
    my $class = shift;
    my $args={};
    my $self = bless $args, $class;
    $self->set_dbh(shift);
    return $self;
}

sub get_all_genes { 
    my $self = shift;
    my $q = "SELECT name FROM pub_gene";
    my $sth = $self->get_dbh()->prepare($q);
    $sth->execute();
    my @pub_genes=();
    while (my ($name)=$sth->fetchrow_array()) { 
	push @pub_genes, Pubgene->new($self->get_dbh(), $name);
    }
    return @pub_genes;
}

=head2 function get_dbh

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub get_dbh { 
    my $self=shift;
    return $self->{dbh};
}

=head2 function set_dbh

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub set_dbh { 
    my $self=shift;
    $self->{dbh}=shift;
}




package Pubgene; 

sub new { 
    my $class = shift;
    my $args = {};
    my $self = bless $args, $class;
    $self->set_dbh(shift);
    $self->set_name(shift);
    return $self;
}

sub get_distinct_articles { 
    my $self = shift;
    
    my $q = "SELECT count(distinct(pub_article.article_id)) 
             FROM pub_gene 
             JOIN pub_term on (pub_gene.id=pub_term.pub_gene_id) 
             JOIN pub_hit on (pub_term.pub_hit_id=pub_hit.id)
             JOIN pub_article on (pub_hit.pub_article_id=pub_article.id)
             WHERE (pub_hit.is_valid <> 'n' || pub_hit.is_valid IS NULL) AND name like ?";
    my $sth=$self->get_dbh()->prepare($q);
    $sth->execute($self->get_name());
    my ($count) = $sth->fetchrow_array();
    return $count;
}

sub get_article_ids { 
    my $self = shift;
    my $q = "SELECT pub_article.id 
             FROM pub_gene 
             JOIN pub_term on (pub_gene.id=pub_term.pub_gene_id) 
             JOIN pub_hit on (pub_term.id=pub_hit.pub_term_id)
             JOIN pub_article on (pub_hit.pub_article_id=pub_article.id)
             WHERE (pub_hit.is_valid <> 'n' || pub_hit.is_valid IS NULL) AND pub_gene.name like ? GROUP BY pub_article.id";
    my $sth=$self->get_dbh()->prepare($q);
    $sth->execute($self->get_name());
    my @articles = ();
    while (my ($article_id) = $sth->fetchrow_array()) { 
	push @articles, $article_id;
    }
    return @articles;
}

=head2 function get_specificity()

  Synopsis:	
  Arguments:	an article id
  Returns:	the number of occurences of the gene divided 
                by the number of occurrences of other genes
  Side effects:	
  Description: 

=cut

sub get_specificity { 
    my $self = shift;
    my $article_id = shift;
    
    # get the number of other genes that occur in the
    # articles linked to this gene
    #
    my $q = "SELECT count(distinct(pub_gene.id))
             FROM pub_hit
             JOIN pub_term on (pub_hit.pub_term_id=pub_term.id)
             JOIN pub_gene on (pub_term.pub_gene_id=pub_gene.id)
             WHERE (pub_hit.is_valid <> 'n' || pub_hit.is_valid IS NULL) AND pub_hit.pub_article_id=? AND pub_gene.name !=?";
    my $sth = $self->get_dbh()->prepare($q);
    $sth->execute($article_id, $self->get_name());
    my ($count) = $sth->fetchrow_array();
    
    # 
    my $r = "SELECT pub_hit.match_number FROM pub_gene
               JOIN pub_term on (pub_gene.id=pub_term.pub_gene_id)
               JOIN pub_hit on (pub_term.id=pub_hit.pub_term_id)
              WHERE (pub_hit.is_valid <> 'n' || pub_hit.is_valid IS NULL)  AND pub_hit.pub_article_id=? and pub_gene.name like ?";
    $sth = $self->get_dbh()->prepare($r);
    $sth->execute($article_id, $self->get_name());
    my ($match_number) = $sth->fetchrow_array();

    # match number is not populated anymore... 
    # quick fix to make the code still work
    #
    $match_number = 1;

    if ($count) { return ($match_number/$count); }
    else { return undef; }
}

=head2 function get_terms()

  Synopsis:	
  Arguments:	none
  Returns:	a hash with keys as terms and the number of 
                occurance as the hash values.
  Side effects:	
  Description:	

=cut

sub get_terms { 
    my $self = shift;
    my $q = "SELECT terms.name, count(*)
               FROM pub_gene 
               JOIN pub_term on (pub_gene.id=pub_term.pub_gene_id)
               JOIN pub_hit on (pub_term.id=pub_hit.pub_term_id)
               JOIN pub_article on (pub_hit.pub_article_id=pub_article.id)
               JOIN pub_hit as termhit on (termhit.pub_article_id=pub_article.id)
               JOIN pub_term as terms on (termhit.pub_term_id=terms.id)
              WHERE (pub_hit.is_valid !='n' || pub_hit.is_valid IS NULL)  AND terms.type !='gene'
                AND pub_gene.name=?
             GROUP BY terms.id";

    #warn ("Running query: $q\n");
    my $sth = $self->get_dbh()->prepare($q);
    $sth -> execute($self->get_name());
    my $hashref = {};
    while (my ($key, $value) = $sth->fetchrow_array()) { 
	print STDERR "$key, $value\n";
	$$hashref{$key}=$value;
    }
    return $hashref; 
}
   
=head2 function get_total_term_ratio()

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub get_total_term_ratio { 
    my $self = shift;
    my $term_name = shift;
    my $q = "SELECT count(distinct(pub_hit.pub_article_id))
             FROM pub_term  JOIN pub_hit ON (pub_term.id=pub_hit.pub_term_id)
               WHERE (pub_hit.is_valid !='n' || pub_hit.is_valid IS NULL) AND pub_term.name like ?";
    my $sth = $self->get_dbh()->prepare($q);
    $sth->execute($term_name);
    my ($total_term_count) = $sth->fetchrow_array();
    
    my $r = "SELECT count(*) from pub_article";
    $sth = $self->get_dbh()->prepare($r);
    $sth->execute();
    my ($total_articles) = $sth->fetchrow_array();
    
    return ($total_term_count/$total_articles);
}




=head2 function get_name()

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub get_name { 
    my $self=shift;
    return $self->{name};
}

=head2 function set_name()

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub set_name { 
    my $self=shift;
    $self->{name}=shift;
}

=head2 function get_dbh

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub get_dbh { 
    my $self=shift;
    return $self->{dbh};
}

=head2 function set_dbh

  Synopsis:	
  Arguments:	
  Returns:	
  Side effects:	
  Description:	

=cut

sub set_dbh { 
    my $self=shift;
    $self->{dbh}=shift;
}

