#!/usr/bin/perl

use strict;
use warnings;
use Data::Dumper;
use Getopt::Std;

our %opts;
getopts('d:o:r:g:', \%opts);

our $replica;
our $ref_err = 0;

if (!defined $opts{d}) {
	usage();
	exit;
} else {
	$replica = read_replicate($opts{r}) if defined $opts{r};
	my $snp_pool = read_all_sts($opts{d});
	$snp_pool = consensus($snp_pool);
	output($opts{o}, $snp_pool) if defined $opts{o};
	exit;
}

#end
#-----------------------------------------------------------------------

sub read_replicate
{
	my $replicate_file = shift;
	open FH, $replicate_file or die "$replicate_file: $!\n";
	my @list = <FH>;
	close FH;

	my %replica;
	foreach my $line (@list) {
		chomp $line;
		my ($e1, $e2) = split /\s+/, $line;
		$replica{$e1} = $e2;
	}
	return \%replica;
}

sub output
{
	print "start output\n";

	if ($ref_err) {
		print "*** ref inconsistence found. ***\n"; 
		print "output abort.\n";
		return;
	}

	my ($out_fn, $snp_pool) = @_;
	my $output;

	foreach my $pos (keys %$snp_pool) {
		foreach my $name (keys %{$snp_pool->{$pos}}) {
			$output->{$name}->{$pos} = $snp_pool->{$pos}->{$name};
		}
	}

	my @locus = sort {$a <=> $b} keys %$snp_pool;
	my @names = sort {$a cmp $b} keys %$output;

	open FH, ">", $out_fn or die "$out_fn: $!\n";

	print FH "\n";
	print FH "\t", join("\t\t", map {"Ref base:"} @locus), "\n";
	print FH "\t", join("\t\t", map {"Ref base: ".$_} @locus), "\n";
	print FH "\t", join("\t\t", map {"STS base: ".$_} @locus), "\n";

	foreach my $name (@names) {
		my $DNA_name = $name;
		$DNA_name =~ s/-.*// if defined $replica;
		my @line = ($DNA_name);
		foreach my $pos (@locus) {
			my $genotype = $output->{$name}->{$pos};
			if (defined $genotype) {
				if ($genotype =~ /[+-\/]/) {
					$genotype = $genotype."\t".$genotype;
				} elsif ($genotype =~ /^(.)(.)$/) {
					$genotype = "$1\t$2";
				} else {
					$genotype = "*\t*";
				}
				push @line, $genotype;
			} else {
				push @line, "?\t?";
			}
		}
		print FH join("\t", @line), "\n";
	}
	close FH;

#	for (my $n=1; @locus>0; $n++) {
#		my @positions = ();
#		if (@locus >= 125) {
#			map {push @positions, shift @locus;} (1 .. 125);
#		} else {
#			@positions = @locus;
#			@locus = ();
#		}
#
#		$out_fn =~ s/(_\d)?\.txt//;
#		$out_fn .= '_'.$n.'.txt';
#		open FH, ">", $out_fn or die "$out_fn: $!\n";
#
#		print FH "\n";
#		print FH "\t", join("\t\t", map {"Ref base:"} @positions), "\n";
#		print FH "\t", join("\t\t", map {"Ref base: ".$_} @positions), "\n";
#		print FH "\t", join("\t\t", map {"STS base: ".$_} @positions), "\n";
#
#		foreach my $name (@names) {
#			my $DNA_name = $name;
#			$DNA_name =~ s/-.*// if defined $replica;
#			my @line = ($DNA_name);
#			foreach my $pos (@positions) {
#				my $genotype = $output->{$name}->{$pos};
#				if (defined $genotype) {
#					if ($genotype =~ /[+-\/]/) {
#						$genotype = $genotype."\t".$genotype;
#					} elsif ($genotype =~ /^(.)(.)$/) {
#						$genotype = "$1\t$2";
#					} else {
#						$genotype = "*\t*";
#					}
#					push @line, $genotype;
#				} else {
#					push @line, "?\t?";
#				}
#			}
#			print FH join("\t", @line), "\n";
#		}
#		close FH;
#	}

	print "output done\n-\n";
}

sub consensus
{
	print "start merging\n";
	my $snp_pool = shift;

	foreach my $pos (sort {$a <=> $b} keys %$snp_pool) {
		foreach my $name (sort {$a cmp $b} keys %{$snp_pool->{$pos}}) {
			my $snp = $snp_pool->{$pos}->{$name};

			my @fn = keys %$snp;
			my $file = shift @fn;
			my $nt = $snp->{$file};

			while (my $f = shift @fn) {
				if ($snp->{$file} ne $snp->{$f}) {
					print join("\t", "inconsistence between sts:", $name, $pos, $file, $snp->{$file}, $f, $snp->{$f}), "\n";
					$nt .= '/'.$snp->{$f};
				}
			}

			$snp_pool->{$pos}->{$name} = $nt;
		}
	}

	print "merging done\n-\n";
	return $snp_pool;
}

sub read_sts
{
	my ($fn, $snp_pool) = @_;
	print "reading $fn\n";
	my $n = 0;

	open FH, "<", $fn or die "$fn: $!\n";
	my @content = <FH>;
	close FH;

	my @ref_snp;
	my %names;

	foreach my $line (@content) {
		next if $line !~ /stSG/i or $line =~ /Unmatched/i;
		$n++;

		chomp $line;
		my @fields = split /\t/, $line, -1;
		my ($sam_name, $ref_name, $dir) = @fields[1,2,3];
		map {shift @fields} (1 .. 13);
		my @mutations = @fields;
		#($ref_name, $dir) = $ref_name =~ /.*_(\w+\d+|chimp)[^_]*_(F|R)\.SCF/i;
		#($sam_name, $dir) = $sam_name =~ /.*_(\w+\d+|chimp)[^_]*_(F|R)\.SCF/i;
		($ref_name, $dir) = $ref_name =~ /.*_(.+)_(F|R)(\.SCF)?/;
		($sam_name, $dir) = $sam_name =~ /.*_(.+)_(F|R)(\.SCF)?/;

		if (!defined $ref_name) {
			my $m = $n + 1;
			die join("\t", "non-standard reference sample name:", $fn, 'line '.$m), "\n";
		}
		if (!defined $sam_name) {
			my $m = $n + 1;
			print join("\t", "non-standard sample name:", $fn, 'line '.$m), "\n";
			next;
		}

		$ref_name =~ s/T19-//;
		$sam_name =~ s/T19-//;
		$ref_name =~ s/\s//;
		$sam_name =~ s/\s//;
		$names{$sam_name} = 1;

		foreach my $i (0 .. $#mutations) {
			next if $mutations[$i] eq '';
			next if $mutations[$i] eq "\015";
			my ($pos, $ref_nt, $sam_nt) = $mutations[$i] =~ /^"?(-?\d+)([ACGT]+)>([ACGT]+)/;
			unless (defined $pos and defined $ref_nt and defined $sam_nt) {
				my $j = $i + 1;
				print join("\t", "non-standard input:", $fn, $sam_name, 'line '.$j), "\n";
				next;
			}

			$ref_nt .= $ref_nt if length $ref_nt == 1;
			$sam_nt .= $sam_nt if length $sam_nt == 1;

			$ref_snp[$i] ||= { pos => $pos, nt => $ref_nt};
			if ($ref_snp[$i]->{pos} != $pos or $ref_snp[$i]->{nt} ne $ref_nt) {
				print join("\t", "ref inconsistence:", $fn, $sam_name, $pos), "\n";
				$ref_err = 1;
			} else {
				$snp_pool->{$pos}->{$ref_name}->{$fn}->{$dir} ||= $ref_nt;
			}

			$snp_pool->{$pos}->{$sam_name}->{$fn}->{$dir} = $sam_nt;
		}
	}

	foreach my $ref (@ref_snp) {
		next unless defined $ref;
		my $pos = $ref->{pos};
		my $nt = $ref->{nt};

		foreach my $name (keys %names) {
			my $snp = $snp_pool->{$pos}->{$name}->{$fn};
			if (!defined $snp) {
				$snp_pool->{$pos}->{$name}->{$fn} = $nt;
			} elsif (!defined $snp->{F}) {
				$snp_pool->{$pos}->{$name}->{$fn} = $snp->{R};
			} elsif (!defined $snp->{R}) {
				$snp_pool->{$pos}->{$name}->{$fn} = $snp->{F};
			} elsif ($snp->{F} ne $snp->{R}) {
				print join("\t", "forward reverse inconsistence:", $fn, $name, $pos, $snp->{F}, $snp->{R}), "\n";
				$snp_pool->{$pos}->{$name}->{$fn} = $snp->{F}.'+'.$snp->{R};
			} else {
				$snp_pool->{$pos}->{$name}->{$fn} = $snp->{F};
			}
		}

#		check consistency between replicate
		next unless defined $replica;
		foreach my $name (keys %$replica) {
			if (!defined $snp_pool->{$pos}->{$name}
					or !defined $snp_pool->{$pos}->{$name}->{$fn}) {
				delete $snp_pool->{$pos}->{$name};
				next;
			}
			my $snp1 = $snp_pool->{$pos}->{$name}->{$fn};
			my $snp2 = $snp_pool->{$pos}->{$replica->{$name}}->{$fn};
			if (!defined $snp2) {
				$snp_pool->{$pos}->{$replica->{$name}}->{$fn} = $snp1;
			} elsif ($snp1 ne $snp2) {
				print join("\t", "replicate inconsistence:", $fn, $pos, $name, $snp1, $replica->{$name}, $snp2), "\n";
#				print "replicate inconsistence: in $fn at loci $pos between sample $name $snp1 & ".$replica->{$name}." $snp2\n";
				$snp_pool->{$pos}->{$replica->{$name}}->{$fn} = $snp1.'-'.$snp2;
			}
			delete $snp_pool->{$pos}->{$name};
		}

	}

	print "$n samples read\n-\n";
	return $snp_pool;
}

sub read_all_sts
{
	my $dir = shift;
	my $snp_pool;

	opendir DH, $dir or die "$dir: $!\n";
	while (my $fn = readdir DH) {
		next unless $fn =~ /^stSG.*txt$/;
		$snp_pool = read_sts("$dir/$fn", $snp_pool);
	}
	closedir DH;
	return $snp_pool;
}

sub usage
{
	die<<USAGE;

Usage : $0 -d <dir> [-r] <replicate> [-o] <output>
-d : dir that contains snp reports of each sts to be merged
-r : file that contains replicate list, in the form of 'PIG0093\\tNA11840\\n'
-o : output file

USAGE
}
