#!/usr/bin/perl
use strict;
use CXGN::DB::InsertDBH;
my $dbh = CXGN::DB::InsertDBH::connect({dbhost=>'hyoscine', dbname=>'cxgn', dbuser=>'postgres'});
my ($create) = @_;

if($create eq "create"){
    eval{create_table($dbh)};
    if($@){
	print "Error creating table:\n";
	print "$@\n";
	$dbh -> rollback();
    }
    else{
	print "Success creating table!";
	$dbh -> commit();
    }
}
eval{insert_enzymes($dbh)};
if($@){
    print "Error populating table:\n";
    print "$@\n";
    $dbh->rollback();
}
else{
    print "Success populating table!\n";
    $dbh->commit();
}

#creates table enzyme_restriction_sites
sub create_table{
    my ($dbh) = @_;
    $dbh -> do("SELECT utils.drop_cxgn_branch_views_for_table('sgn', 'markers')");
    $dbh -> do("set search_path = sgn_bt");
    $dbh -> do("CREATE TABLE enzyme_restriction_sites(enzyme_id INTEGER, restriction_site TEXT)");
    $dbh -> do("SELECT utils.create_cxgn_branch_views_for_table('sgn', 'markers')");
    $dbh -> do("SELECT utils.make_all_rows_visible('sgn', 'markers')");
}

#makes a hash table with each enzyme name as the key to an array of all its possible restriction sites
sub populate_hash{
    my %enzyme_sites = ();
    open (my $file_stuff, "/home/emily/restriction_enzymes.txt") or print "File not found\n";
    while (my $line = <$file_stuff>){
	chomp($line);
	$line =~ /^(.*)\t(.*)$/;
	my $enzyme = $1;
	my $symbolic_sequence = $2;
	my $sequence_array = all_possible_sites($symbolic_sequence);#an array of all possible DNA sequences
	$enzyme_sites{$enzyme} = $sequence_array;
    }
    return \%enzyme_sites;
}

#returns an array of all possible restriction sites
sub all_possible_sites{
    my ($original_site) = @_;
    if ($original_site !~ m/^.*\(\d*\/\d*\).*$/ #sequence doesn't already have numbers indicating cutting site
    && $original_site !~ s/ /^/){ #change the symbol for cut from " " to "^", if possible
	$original_site .= "^"; #The cut is at the end of the sequence
    }
    my @fixed_sites = ();
    all_possible_sites_helper($original_site, \@fixed_sites);
    return \@fixed_sites;
}

#recursive helper method for all_possible_sites
sub all_possible_sites_helper{
    my ($original_site, $fixed_sites) = @_;
    if($original_site =~ m/^(.*)N(.*)$/){
	my $option1 = $1."A".$2;
	my $option2 = $1."C".$2;
	my $option3 = $1."T".$2;
	my $option4 = $1."G".$2;
	if (has_possibilities($option1)){
	    all_possible_sites_helper($option1, $fixed_sites); #get rid of the remaining places with alternatives
	    all_possible_sites_helper($option2, $fixed_sites); #if there is a decison place for option1, there will also be one for the other options
	    all_possible_sites_helper($option3, $fixed_sites);
	    all_possible_sites_helper($option4, $fixed_sites);
	}
	else{
	    push @$fixed_sites, $option1;
	    push @$fixed_sites, $option2;
	    push @$fixed_sites, $option3;
	    push @$fixed_sites, $option4;
	}
    }
    #some sequences have an place where a letter can be one of two things
    #for example CC^(A/T)GG 
    elsif ($original_site =~ m/^(.*)\(([[:alpha:]])\/([[:alpha:]])\)(.*)$/){
	#in this example, $1 is "CC^", $2 is "A", $3 is "T", $4 is "GG"
	my $option1 = $1.$2.$4;
	my $option2 = $1.$3.$4;
	if (has_possibilities($option1)){
	    all_possible_sites_helper($option1, $fixed_sites);
	    all_possible_sites_helper($option2, $fixed_sites);
	}
	else{
	    push @$fixed_sites, $option1;
	    push @$fixed_sites, $option2;
	}
    }
    #some sequences have places with three alternatives
    elsif ($original_site =~ m/^(.*)\(([[:alpha:]])\/([[:alpha:]])\/([[:alpha:]])\)(.*)$/){
	my $option1 = $1.$2.$5;
	my $option2 = $1.$3.$5;
	my $option3 = $1.$4.$5;
	if(has_possibilities($option1)){
	    all_possible_sites_helper($option1, $fixed_sites);
	    all_possible_sites_helper($option2, $fixed_sites);
	    all_possible_sites_helper($option3, $fixed_sites);
	}
	else{
	    push @$fixed_sites, $option1;
	    push @$fixed_sites, $option2;
	    push @$fixed_sites, $option3;
	}    
    }
    else{
	push @$fixed_sites, $original_site;
    }
    return $fixed_sites;
}

#returns true if there are any places in the given sequence which contain (N/N) or (N/N/N) format
sub has_possibilities{
    my ($site) = @_;
    return ($site =~ m/^(.*)\(([[:alpha:]])\/([[:alpha:]])\)(.*)$/)
 	 ||($site =~ m/^.*\([[:alpha:]]\/[[:alpha:]]\/[[:alpha:]]\).*$/)
	 ||($site =~ m/^(.*)N(.*)$/);
}

#gets an enzyme's id number out of the enzymes table
sub get_enzyme_id{
    my ($dbh, $enzyme_name) = @_;
    $dbh -> do("set search_path = 'sgn_bt'");
    my $name_id_hashref = $dbh -> selectall_hashref("SELECT enzyme_name, enzyme_id FROM enzymes", "enzyme_name");
    if(!exists $name_id_hashref->{$enzyme_name}){
	my $sth = $dbh -> prepare("INSERT INTO enzymes(enzyme_name) VALUES (?)");
	$sth -> execute($enzyme_name);
	$dbh -> commit();
    }
    my $updated_name_id_hashref = $dbh -> selectall_hashref("SELECT enzyme_name, enzyme_id FROM enzymes", "enzyme_name");
    return $updated_name_id_hashref->{$enzyme_name}{enzyme_id};
}

#puts all of the enzymes and their restriction sites into enzyme_restriction_sites
sub insert_enzymes{
    my ($dbh) = @_;
    my $enz_hash_ref = populate_hash();
    my %enzyme_ids = ();
    foreach my $e(keys(%$enz_hash_ref)){
	my $id = get_enzyme_id($dbh, $e);
	$enzyme_ids{$id} = $enz_hash_ref ->{$e};
    }
    my $formatted_enz = join ", ", keys(%enzyme_ids);
    $dbh -> do("set search_path=sgn_bt,utils");
    my $enz_in_table = $dbh -> selectall_hashref("SELECT enzyme_id, restriction_site FROM enzyme_restriction_sites WHERE enzyme_id IN($formatted_enz)", "enzyme_id");
    my @update_enzymes = ();
    my @insert_enzymes = ();
    foreach my $e (keys(%enzyme_ids)){
	if(exists $enz_in_table->{$e}){
	    my $rest_seq_in_table = $enz_in_table -> {$e}{restriction_site};
	    if($rest_seq_in_table eq ""){ #the enzyme is in the table without a restriction sequence
		push @update_enzymes, $e;
	    }
	    else{
		#print "There is already a restriction_site for the enzyme $e.  It is: $rest_seq_in_table\n";
	    }
	}
	else{ #the enzyme is not in the table yet
	    push @insert_enzymes, $e;
	}   
    }
    foreach my $enz_id (@update_enzymes){
	my $seq_array_ref = $enzyme_ids{$enz_id};
	for(my $i = 0; $i<@$seq_array_ref; $i++){
	    my $seq = "\'$seq_array_ref->[0]\'";
	    if($i==0){
		my $sth = $dbh -> prepare("UPDATE enzyme_restriction_sites SET \"restriction_site\" = ? WHERE enzyme_id = ?");
		$sth -> execute($seq, $enz_id);
	    }
	    else{
		my $sth = $dbh -> prepare("INSERT INTO enzyme_restriction_sites(enzyme_id, restriction_site) VALUES (?,?)");
		$sth -> execute($enz_id, $seq);
	    }
	}
    }
    foreach my $enz_id (@insert_enzymes){
    	my $seq_array_ref = $enzyme_ids{$enz_id};
	foreach my $seq(@$seq_array_ref){
	    my $sth = $dbh -> prepare("INSERT INTO enzyme_restriction_sites(enzyme_id, restriction_site) VALUES (?,?)");
	    $sth -> execute($enz_id, $seq);
	}
    }
}
