#!/usr/bin/perl -w
use strict;
use DBI;
use DB_File;

# Show the built-in help text and quit (exit code -1) when the first
# argument is missing/false or is the literal word "help".
my $show_help = !$ARGV[0] || $ARGV[0] eq "help";

if ($show_help) {
  my $help_text = <<EOF;

  Program to unpack the .bdb (berkeley database) files of chromatograms
  and translate the interim SGN names (eg: cLED-1-A1.TH.A) back to TIGR
  names. Why do this? 

  (1) the new SGN better handles different nomenclature systems, and prefers
      to know the sequencing facility ID for the chromatograms

  (2) The bdb system was created because the filesystem was not very good
      at handling large directories (directories with tens of thousands of
      relatively small files). Newer, stable filesystems for Linux are now
      available, such as XFS and reiserfs and eliminate this problem. It is
      thus desireable to store the files now directly in the filesystem and
      eliminate the bdb complexity. [ NOTE: these filesystems are not yet
      in use on amatxu at time of writing this, but we are unpacking them
      anyway ]

  How it is done:

  A list of .bdb files is read from STDIN (use find). The index is
  read and old SGN (tomato) is queried for name translation back to TIGRs
  identifers.

  Usage: <output directory>

EOF
  print $help_text;
  exit(-1);
}

# Root directory under which one sub-directory per library code is created.
my ($output_dir) = @ARGV;

# NOTE(review): the database credentials are hard-coded in the source; they
# should be moved to a configuration file or the environment.
# DBI->connect returns undef on failure, so stop here with the driver's
# message instead of failing later with a method call on an undefined value.
my $dbh = DBI->connect("dbi:mysql:host=localhost;database=tomato","koni","bitchbadass")
  or die "Failed to connect to database \"tomato\" ($DBI::errstr)";

# Primary lookup: facility_id from id_map, keyed by sgn_id.
my $idq = $dbh->prepare("SELECT facility_id from id_map where sgn_id=?")
  or die "Failed to prepare id_map query (" . $dbh->errstr . ")";

# Fallback lookup: tigr_id from clone_id_map, keyed by sgn_id.
my $backupq = $dbh->prepare("SELECT tigr_id from clone_id_map where sgn_id=?")
  or die "Failed to prepare clone_id_map query (" . $dbh->errstr . ")";

# Log of SGN ids for which neither lookup produced a name. Three-argument
# open keeps the mode separate from the path.
open UNMATCHED, ">", "/tmp/unmatched-sgn-ids.tdv"
  or die "Failed to open unmatched-sgn-ids.tdv file ($!)";

# In-memory translation table (SGN id => facility id) loaded from
# "mapping-list.txt" in the current directory. Each line holds two
# whitespace-separated columns: facility id first, SGN id second.
my %map;
keys(%map) = 1 << 20;    # pre-size the hash buckets for a large table

open my $keyfile, "<", "mapping-list.txt"
  or die "Failed to open keyfile \"mapping-list.txt\" ($!)";

while (my $line = <$keyfile>) {
  chomp $line;
  my ($facility, $sgn) = split ' ', $line;

  # Blank or single-column lines would otherwise create an entry with an
  # undefined key or value (and "uninitialized" warnings); ignore them.
  next unless defined $sgn;

  $map{$sgn} = $facility;
}

close $keyfile;

# Counters reported at the end of the run: how many ids needed the primary
# database query, and how many of those fell through to the backup query.
my ($used_idq, $used_backupq) = (0,0);

# Each line on STDIN names one .bdb file. Every record in the file is
# renamed (SGN id -> facility/TIGR id), converted from bzip2 to gzip and
# written to <output dir>/<library code>/<id>.gz.
FILE:
while (my $bdb_filename = <STDIN>) {
  chomp $bdb_filename;
  next FILE unless length $bdb_filename;    # tolerate blank input lines

  # The library code is everything in the given name up to the first '-'.
  # NOTE(review): a directory prefix in the name (e.g. from "find .") becomes
  # part of the library code -- confirm whether a basename is intended.
  my ($library_code) = $bdb_filename =~ m/^([^\-]+)/;
  if (!defined($library_code)) {
    print STDERR "Cannot derive library code from \"$bdb_filename\", skipping\n";
    next FILE;
  }
  my $library_dir = "$output_dir/$library_code";

  # List-form system() bypasses the shell, so spaces or metacharacters in
  # the path cannot be misinterpreted; "--" protects names starting with '-'.
  system("mkdir", "-p", "--", $library_dir) == 0
    or die "Failed to create directory \"$library_dir\" ($?)";

  print STDERR "Processing file $bdb_filename....";
  my %cgrams = ();
  my $database = tie(%cgrams, 'DB_File', "$bdb_filename", O_RDONLY, 0666,
                     $DB_HASH)
    or die "Failed to bind databsae file \"$bdb_filename\" ($!)";

  ID:
  foreach my $sgn_id ( keys %cgrams ) {
    my $tigr_id;

    if (!defined($map{$sgn_id})) {
      # Not in the preloaded mapping list: ask the database.
      $used_idq++;
      $idq->execute($sgn_id)
        or die "id_map query failed for $sgn_id (" . $idq->errstr . ")";

      # Count fetched rows rather than calling $sth->rows, which DBI does
      # not guarantee to be meaningful for SELECT before all rows are read.
      my $id_rows = $idq->fetchall_arrayref() || [];

      if (@$id_rows == 0) {
        $used_backupq++;
        $backupq->execute($sgn_id)
          or die "clone_id_map query failed for $sgn_id ("
               . $backupq->errstr . ")";
        my $backup_rows = $backupq->fetchall_arrayref() || [];

        if (@$backup_rows == 0) {
          print STDERR "No entry for $sgn_id\n";
          print UNMATCHED "$sgn_id\n";
          next ID;
        }
        # As before, the first row wins when the backup table has several.
        $tigr_id = $backup_rows->[0][0];
      } elsif (@$id_rows > 1) {
        print STDERR "More than one entry for $sgn_id\n";
        next ID;
      } else {
        $tigr_id = $id_rows->[0][0];
      }
    } else {
      $tigr_id = $map{$sgn_id};
    }

    # A NULL or empty id would produce a file literally named ".gz";
    # record the id as unmatched instead.
    if (!defined($tigr_id) || $tigr_id eq "") {
      print STDERR "Empty id returned for $sgn_id\n";
      print UNMATCHED "$sgn_id\n";
      next ID;
    }

    # Stored values are fed through "bzip2 -dc" and re-compressed with gzip.
    # The target path is handed to sh as a positional argument ("$1") so the
    # id, which comes from the database or the mapping file, is never parsed
    # as shell syntax.
    my $target = "$library_dir/${tigr_id}.gz";
    my $pipe;
    if (!open($pipe, "|-", "sh", "-c",
              'bzip2 -dc - | gzip -c - > "$1"', "sh", $target)) {
      print STDERR "Failed to start recompression pipe for $sgn_id ($!)\n";
      next ID;
    }
    binmode $pipe;    # chromatogram data is binary
    print {$pipe} $cgrams{$sgn_id};
    close $pipe
      or print STDERR "Error closing pipe: ($!) ($?)\n";

  }

  # Drop the object reference before untie so the tie is released cleanly.
  undef $database;
  untie %cgrams;

  print STDERR "done\n";
}

# Buffered write errors only surface when the handle is closed, so report a
# failed close instead of silently losing part of the unmatched-id log.
close UNMATCHED
  or print STDERR "Error closing unmatched-sgn-ids.tdv file: ($!)\n";

# Summary of how often each database query was needed.
print "idq: $used_idq\n";
print "backupq: $used_backupq\n";
