#!/usr/local/bin/perl
#run from linux machine
#/nfs/team19/by1/phd/perl/hgdp2sweep perl hgdp2sweep.pl -infile_name xxx -outfile_name xxx
use strict;
use warnings;

use Getopt::Long;

my ($infile_name,$outfile_name);

GetOptions('infile_name=s' => \$infile_name,
	   'outfile_name=s' => \$outfile_name,
	  );

#(-f $infile_name && -f $outfile_name) or die <<USAGE;
#    Usage: $0
#                -infile_name    <HapMap file>
#                -outfile_name   <filename to contain snp positions and gtypes>
#
#USAGE

if (!$infile_name) {
  print "You need infile_name, such as :
-infile_name xxx -outfile_name xxx\n";
}

my %rec_base = ('A',1,'C',2,'G',3,'T',4,'N',5);
my (%rec_child);

open CHI, "children_sid.txt" or die "Can't open childrens file $!\n";

while (<CHI>) {
  chomp;
  my ($child1, $child2) = split ;
  $rec_child{$child1}=1;
  $rec_child{$child2}=1;
}

open IN, "$infile_name" or die "Can't open input name: $!\n";
open OUT, ">$infile_name\.phase_inp";
open OUT1, ">$outfile_name\.snp";
open OUT2, ">$outfile_name\.emphase";

my (%ind_id, %ind_name, %rec_snp);

while (<IN>) {
  if (/^rs\#/) {
    my $i=1;
    my @all = split;
    my @ind_names = splice (@all,3);
    foreach my $ind (@ind_names) {
      $ind_id{$i} = $ind;
      $ind_name{$ind}=$i;
      $i++;
    }
  }
  elsif (/^rs\d+/) {
    my $h={};
    my %ind_gtype;
    my $i=1;
    my @all = split;
    my $rsname = $all[0];
    my ($chr) = $all[1] =~ /Chr(\d+)/;
    my $pos = $all[2];

    my @ind_gtypes = splice (@all,3);
    $h->{'rsname'} = $rsname;
    $h->{'chr'} = $chr;
    $h->{'pos'} = $pos;
    foreach my $gtype (@ind_gtypes) {
      $ind_gtype{$i}{$rsname} = $gtype;
      $i++;
    }
    $h->{'ind_gtype'} = \%ind_gtype;
    $rec_snp{$rsname} = $h;
  }
}

my (@ind_names,@pos,@types);

foreach my $ind_name (keys %ind_name) {
  if (!$rec_child{$ind_name}) {
    push @ind_names, $ind_name;
  }
}

my $valid_ind = scalar @ind_names;

my @hs = values %rec_snp;

@hs = sort {$a->{'pos'}<=>$b->{'pos'}} @hs;

print OUT1 "snpid\tchr\tHG17\n";

foreach my $h (@hs) {
  print OUT1 $h->{'rsname'},"\t",$h->{'chr'},"\t",$h->{'pos'},"\n";
}

my $num_snps = scalar @hs;

print OUT "$valid_ind\n$num_snps\nP\t";

foreach my $h(@hs) {
  push @pos, $h->{'pos'};
  push @types, 'S';
}
print OUT join "\t",@pos,"\n";
print OUT join "\t",@types,"\n";


foreach my $ind_name (@ind_names) {
  my (@base1,@base2);
  if (!($rec_child{$ind_name})) {
    my $i = $ind_name{$ind_name};
    foreach my $h (@hs) {
      my $rsname = $h->{'rsname'};
      my $ind_gtype = $h->{'ind_gtype'};
      my %ind_gtype = %$ind_gtype;

      my $gt = $ind_gtype{$i}{$rsname};
      my ($base1, $base2) = split '', $gt;
      $base1 =~ s/n/?/i;
      $base2 =~ s/n/?/i;
      push @base1,$base1;
      push @base2,$base2;
    }
    print OUT "#$ind_name\n";
    print OUT join "\t",@base1,"\n";
    print OUT join "\t",@base2,"\n";
  }
}

##run phase###
my $phase_command = "/nfs/team19/by1/phd/phase.2.1.1.linux/PHASE";
system("$phase_command $infile_name\.phase_inp $infile_name\.phase_out");

##parse phase output file###

if (-e "$infile_name\.phase_out" and ! -z "$infile_name\.phase_out") {
  open PHASE, "$infile_name\.phase_out" or die "$infile_name\.phase_out can't be opened:$!";
}

my ($ind_name,$best,$name,%ind_hap);

while (<PHASE>) {
  if (/^BEGIN BESTPAIRS1$/) {
    $best=1;
  }
  elsif ($best and /^0\s+\#(.*)$/) {
    $ind_name = $1;
    $name=1;
  }
  elsif ($best and $name) {
   my $line = $_;
   $line =~ s/[\(\)\[\]]*//g;
   my @a = split /\s+/, $line;
   push @{$ind_hap{$ind_name}},\@a;
  }
  elsif (/^END BESTPAIRS1$/) {
    undef $best;
    undef $name;
    last;
 }
}

foreach my $ind_name (keys %ind_hap) {
  my ($a,$a1) = @{$ind_hap{$ind_name}};
  my @a = @$a;
  my @a1 = @$a1;
  my (@base1,@base2);
  foreach my $base (@a) {
    push @base1, $rec_base{$base};
  }
  foreach my $base (@a1) {
    push @base2, $rec_base{$base};
  }
  unshift @base1,"$ind_name\tT";
  unshift @base2,"$ind_name\tU";
  print OUT2 join "\t",@base1,"\n";
  print OUT2 join "\t",@base2,"\n";
}
