#!/usr/local/bin/perl -w
#
# Maureen Liu notes: This version was downloaded from methtools website in 2007
#   and modified by Maureen Liu (Sanger Institute). All modifications are
#   accompanied by 'ML' notes.
#
################################################################################
# Requirements                                                                 #
################################################################################
#
require 5.002;
#
# External References
#
use diagnostics;
use strict;
use Carp;
use FileHandle;
use File::Path;
use File::Basename;
use Getopt::Std;
#
use vars qw( $opt_f $opt_d $opt_h );
#
################################################################################
# Package revision string                                                      #
################################################################################
#
# $prg   : script name as invoked, interpolated into the SYNOPSIS of the help
# $rev   : revision string of this script
# $debug : debug flag (not referenced anywhere else in this file)
my ( $prg, $rev, $debug ) = ( basename($0), "1.00", 0 );
#
################################################################################
# Main program global variables                                                #
################################################################################
#
# FASTA names and sequences read from the input file, kept in parallel:
# entry 0 is the mother-sequence, entries 1.. are the bisulfite sequences.
my ( @methylnam, @methylseq );
#
# One handle object, reused for the input file and every output file.
my $FILE = FileHandle->new();
#
# Translation table. Key: mother base followed by bisulfite base.
# Value: the base written to the converted sequence, or "?" for a
# combination that is reported as unexpected. One row per mother base.
my %trans = (
    'aa' => 'a', 'at' => '?', 'ag' => '?', 'ac' => '?', 'an' => 'n', 'a-' => '-',
    'ta' => '?', 'tt' => 't', 'tg' => '?', 'tc' => '?', 'tn' => 'n', 't-' => '-',
    'ga' => '?', 'gt' => '?', 'gg' => 'g', 'gc' => '?', 'gn' => 'n', 'g-' => '-',
    'ca' => '?', 'ct' => 'c', 'cg' => '?', 'cc' => 'C', 'cn' => 'n', 'c-' => '-',
    'na' => 'a', 'nt' => 't', 'ng' => 'g', 'nc' => 'c', 'nn' => 'n', 'n-' => '-',
);
#
# Scratch variables shared by the top-level loops of this script.
my (
    $index, $index2, $pos,
    $seq,   $name,   $file,
    $base,  $baseo,  $basei,
    $pat,   $pat1,   $pat2,
    $cover, $meth_y, $meth_n,
);
#
# %freq : per-position counters, keyed by dinucleotide
# %stat : per-sequence counters, keyed by (mother base, written base)
my ( %freq, %stat );

# ML variables: CpG counts ("cg" / "Cg"), methylation density and the
# search cursor used while counting.
my ( $cg, $Cg, $dens ) = ( 0, 0, 0 );
my $position = -1;
my $next     = 0;


#
# MAN page
#
# The heredoc below is interpolating: $prg is substituted into the SYNOPSIS
# and any literal "@" has to stay escaped. The text describes the output of
# the ML-modified script (density-prefixed file names, CpG-only symbols and
# the five-column .tab layout).
my $MAN = <<MAN;
NAME

     convert_bisulfite: conversion of bisulfite generated methylation data

AVAILABILITY

     Requires perl 5.002

DESCRIPTION

     The input format for this program consists in a single file containing
     the mother-sequence followed by the bisulfite generated sequences in
     FASTA format. All sequences have to be aligned and brought to equal
     length to generate this input file ("file"). The alignment can be done
     by hand or with a suitable program.
     We recommend ClustalW (Thompson, 1994) using a modified conversion
     matrix with equal probability of [c to c] and [c to t] exchanges.
     The mother-sequence is supposed to have no internal gaps while "n"
     are allowed to represent unknown bases. The program compares the
     mother-sequence with the bisulfite generated sequences below in the
     file "file" and generates one file per sequence, named
     "density_FASTA-name.seq1" (FASTA-names are converted to lower case).
     The density prefix is the percentage of "Cg" among all "Cg" and "cg"
     dinucleotides of the converted sequence, rounded up to the next
     integer and written with at least two digits; "ALL" stands for 100
     per cent and "EXP" is used for every sequence whose FASTA-name
     contains "exp".
     Cytosines in the  bisulfite sequences are converted
     into capital "C" and thymines that are used to be cytosines in the
     mother-sequence to lower case "c". All other bases are written in
     lower case letters. As long as cytosine is not concerned, conflicts
     between the sequences are resolved in favor of the untreated original.
     In parallel, a second file is generated (density_FASTA-name.seq2)
     containing a symbolized representation of the seq1-file restricted to
     CpG dinucleotides: the "c" of "cg" is written as "I" and the "C" of
     "Cg" as "O". Unknown bases ("n") are written as "X", gaps as blanks
     and all remaining lower case bases as "-". A capital "C" that is not
     followed by "g" is kept as "C".
     The program writes a log-file to the standard output. We suggest to
     redirect this output into a file for further use. The log-file contains
     the FASTA-names of the processed sequences, a conversion statistics
     and the position and nature of differences (other than C to T exchanges)
     between the mother-sequence and the PCR products.
     Finally, a table is written into a file "file.tab". The first column
     contains the positions  from base 1 to the end of the sequence and the
     second column the nucleotide in the mother sequence. Only cytosines in
     CpG dinucleotides are evaluated; for those with data, three more
     columns follow: the number of analyzed cytosines per position, the
     number of methylated cytosines, and the average methylation in per
     cent. One sequence (the "exp" entry) is discounted from the number of
     methylated cytosines.

SYNOPSIS

     $prg [-f file] [-d] [-h] [>file.log]

OPTIONS

    -f string File name
    -d        Switchs to debug mode
    -h        Prints this message

EXIT STATUS

     The following perl-like exit stati are returned:

     0        Error
     1        Successful completion
 
AUTHOR(S)

     Ruben Schattevoy (schattev\@imb-jena.de)

CHANGE LOG

     When             Who  What

     Jul    6th, 1988 RS   Initial version
     Mar.  23rd, 2007 ML   sort clones by methylation density
     Jul.   2nd, 2008 ML   change to tab, only consider cpg

MAN
#
# Command line parsing 
#
# Unknown switches make getopts() return false: show the manual and abort.
unless ( getopts('f:dh') ) {
    croak($MAN);
}
#
# The manual is also shown (through croak) when help is requested ...
#
if ( $opt_h ) {
    croak($MAN);
}
#
# ... or when no usable input file name was given with -f.
#
unless ( defined($opt_f) && $opt_f ne "" ) {
    croak($MAN);
}
#
# Read input file and split into fasta sequences on the fly
#
# The explicit "< " mode keeps a file name such as ">x" or "cmd|" from being
# interpreted as an open mode; this two-argument form needs no newer perl
# than the rest of the script.
open($FILE,"< ".$opt_f) || croak(sprintf("Cannot open file \"%s\"",$opt_f));
while ( <$FILE> ) {
    chomp;
    s/\s+$//;         # drop trailing blanks and the "\r" of DOS line endings
    tr /A-Z/a-z/;     # names and sequences are handled in lower case
    if ( /^>(\S+)/ ) {
        # New FASTA entry: remember its name, start an empty sequence
        push(@methylnam,$1);
        push(@methylseq,"");
    } elsif ( @methylseq ) {
        # Continuation line of the current entry
        $methylseq[-1] .= $_;
    } elsif ( $_ ne "" ) {
        # Without a preceding header there is no entry to append to;
        # $methylseq[-1] on the empty array would abort with an obscure
        # perl error. Blank leading lines are simply skipped.
        croak(sprintf("Sequence data before first FASTA header in file \"%s\"",
                      $opt_f));
    }
}
close($FILE) || croak(sprintf("Cannot close open file \"%s\"",$opt_f));
#
# Error checking: the mother-sequence plus at least one bisulfite
# sequence are required
#
if ( $#methylnam < 1 ) {
    croak("Cannot find methylation data in input file");
}
#
# Reset frequency counter array
# ML: Count cpg only. For original script, change cg to c.
#
# Zero one counter per position of the mother-sequence for each of the two
# dinucleotides that are tallied later on ("cg" and "Cg").
foreach $index2 ( 0..length($methylseq[0])-1 ) {
    foreach $base ( qw(cg Cg) ) {
        $freq{$base}[$index2] = 0;
    }
}
#
# Process every bisulfite generated sequence. Entry 0 is the mother-sequence
# and serves as the reference for all comparisons. For each entry this
#   - builds the converted sequence $seq using %trans,
#   - adds its "cg"/"Cg" dinucleotides to the per-position counters %freq,
#   - derives the methylation density used as file name prefix,
#   - writes the .seq1 and .seq2 files and logs statistics to STDOUT.
foreach $index ( 1..$#methylnam ) {
    #
    # Error checking: all sequences have to be aligned to equal length
    #
    unless ( length($methylseq[0]) == length($methylseq[$index]) ) {
        croak(sprintf("Inconsistent length in entry #%d of methylation data",
                      $index+1));
    }
    #
    $name = $methylnam[$index];
    $seq  = "";
    %stat = ();
    #
    printf("\nProcessing file \"%s\"\n",$name);
    #
    # Translate position by position
    #
    foreach $pos ( 0..length($methylseq[0])-1 ) {
        #
        $baseo = substr($methylseq[     0],$pos,1);   # mother base
        $basei = substr($methylseq[$index],$pos,1);   # bisulfite base
        #
        $base  = $baseo . $basei;
        $base  = exists($trans{$base}) ? $trans{$base} : undef;
        #
        unless ( defined($base) ) {
            # Pair is not in the table at all: report and stop.
            # NOTE(review): a bare exit ends the script with status 0
            # although this is an error path; kept as in the original.
            printf("Unexpected nucleotide information \"%s:%s\" in input ".
                   "sequence at position \"%d\"\n",$baseo,$basei,$pos+1);
            exit;
        }
        if ( $base eq "?" ) {
            # Conflict: report it and keep the bisulfite base as it is
            printf("Unexpected nucleotide combination \"%s:%s\" in input ".
                   "sequence at position \"%d\"\n",$baseo,$basei,$pos+1);
        } else {
            $basei = $base;
        }
        #
        $seq .= $basei;
        #
        $stat{$baseo,$basei}++;
    }
    #
    # Accumulate "cg" and "Cg" frequency counters
    # ML: only consider cpg
    #
    foreach $index2 ( 0..length($methylseq[$index])-1 ) {
        $base = substr($seq,$index2,2);     # ML: check dinucleotide
#       $base = substr($seq,$index2,1);     # original script
        $freq{$base}[$index2]++ if ( exists($freq{$base}) );
    }
    #
    #========================================================================
    # Added by ML, Mar 2007
    # Count overall methylation and calculate density for each sequence
    #
    # A list assignment in scalar context yields the number of matches.
    #
    $Cg = () = $seq =~ /Cg/g;
    $cg = () = $seq =~ /cg/g;
    #
    # A sequence without any CpG has no density; use 0 instead of
    # dividing by zero.
    #
    if ( $Cg + $cg > 0 ) {
        $dens = ($Cg/($Cg+$cg)*100);
    } else {
        $dens = 0;
    }
    #
    # ML: round up if density is not an integer (any non-digit character
    # in its string form, i.e. a decimal point, marks a fraction)
    #
    if ( $dens =~ m/\D/ ) { $dens = int ($dens) + 1;  }
    #
    # format density prefix for seq1 filenames
    #
    if    ($name =~ m/exp/) {$dens = "EXP";}
    elsif ($dens == 100)    {$dens = "ALL";}
    elsif ($dens == 0)      {$dens = "00";}
    elsif ($dens<10)        {$dens = "0"."$dens";}
    #
    #========================================================================
    #
    # Write sequence file in "atgcC"-format, 50 characters per line
    #
    $file = $dens . "_" . $name . ".seq1"; # ML: add density prefix to filename
    #
    open($FILE,">" . $file) || croak(sprintf("Cannot open file \"%s\"",$file));
    printf $FILE (">%s\n",$file);
    printf $FILE ("%s\n",join("\n",grep($_,split(/(.{50})/,$seq))));
    close($FILE) || croak(sprintf("Cannot close open file \"%s\"",$file));
    #
    # Log how often "C" and "[Cc]" are followed by each base. The
    # substitution replaces every match by itself and is used only for
    # its return value, the number of matches.
    #
    printf("\n");
    printf("%-4s : %4s %4s (Length = %d)\n","Base","C","[Cc]",length($seq));
    foreach $pat1 ( qw( a t g [Cc] n ) ) {
        printf("%-4s :",$pat1);
        foreach $pat2 ( qw( C [Cc] ) ) {
            $pat = $pat2 . $pat1;
            printf(" %4d",$seq =~ s/($pat)/$1/g);
        }
        printf("\n");
    }
    printf("\n");
    #
    # Write sequence file in symbolized format: "cg" becomes "I-",
    # "Cg" becomes "O-", "n" becomes "X", gaps become blanks and every
    # remaining lower case base becomes "-"
    #
    $seq  =~ s/cg/Ig/g;
    $seq  =~ s/Cg/Og/g;
    $seq  =~ s/n/X/g;
    $seq  =~ s/\-/ /g;
    $seq  =~ s/[atgc]/-/g;
    #
    $file = $dens . "_" . $name . ".seq2"; # ML: add density prefix to filename
    open($FILE,">" . $file) || croak(sprintf("Cannot open file \"%s\"",$file));
    printf $FILE (">%s\n",$file);
    printf $FILE ("%s\n",join("\n",grep($_,split(/(.{50})/,$seq))));
    close($FILE) || croak(sprintf("Cannot close open file \"%s\"",$file));
    #
    # Log the conversion matrix: one row per mother base, one column per
    # base written to the converted sequence
    #
    printf("%4s %4s %4s %4s %4s %4s %4s\n",qw( a t g c C n - ));
    #
    foreach $baseo ( qw( a t g c n ) ) {
        foreach $basei ( qw( a t g c C n - ) ) {
            printf("%4d ",exists($stat{$baseo,$basei}) ? $stat{$baseo,$basei} : 0);
        }
        printf("%4s\n",$baseo);
    }
}
#
#========================================================================
# ML: for our purpose, only cpg methylation is considered
#
# Write the per-position table "<input>.tab". Every line holds the 1-based
# position and the mother base; positions where the mother-sequence has a
# "cg" dinucleotide and data are available additionally get the coverage,
# the number of "Cg" observations and the resulting percentage.
$file = $opt_f . ".tab";
#
open($FILE,">".$file) || croak(sprintf("Cannot open file \"%s\"",$file));
foreach $index2 ( 0..length($methylseq[0])-1 ) {
    $base  = substr($methylseq[0],$index2,2);          # ML: dinucleotide
    $baseo = substr($methylseq[0],$index2,1);
    printf $FILE ("%4d %s",$index2+1,$baseo);
    if ( exists($freq{$base}) ) {
        #
        # Despite their names, $meth_y holds the "cg" count and $meth_n
        # the "Cg" count at this position.
        #
        $meth_y = $freq{$base}[$index2];
        $base   =~ tr/c/C/;
        $meth_n = $freq{$base}[$index2]-1;  # ML: minus exp seq
        #
        # The subtraction goes negative when no sequence contributed a
        # "Cg" here; clamp so that no negative counts are written and
        # the percentage below cannot divide by zero.
        #
        $meth_n = 0 if ( $meth_n < 0 );
        #
        # counting number of analysed cytosines ("coverage")
        #
        $cover  = $meth_y+$meth_n;
        if ( $cover > 0 ) {
            printf $FILE (" %3d",$cover);
            printf $FILE (" %3d",$meth_n);  # ML: also record no. of 5mC
            printf $FILE (" %3d",100.0*$meth_n/$cover);
        }
    }
    printf $FILE ("\n");
}
close($FILE) || croak(sprintf("Cannot close open file \"%s\"",$file));

#========================================================================
# ML: the original .tab file script

#$file = $opt_f . ".tab";
##
#open($FILE,">".$file) || croak(sprintf("Cannot open file \"%s\"",$file));
#foreach $index2 ( 0..length($methylseq[0])-1 ) {
#    $base = substr($methylseq[0],$index2,1);
#    printf $FILE ("%4d %s",$index2+1,$base);
#    if ( exists($freq{$base}) ) {
#	$meth_y = $freq{$base}[$index2];
#	$base   =~ tr/a-z/A-Z/;
#	$meth_n = $freq{$base}[$index2];
#	#
#	# counting number of analysed cytosines ("coverage")
#	#
#	$cover  = $meth_y+$meth_n;
#	if ( $meth_y || $meth_n ) {
#	    printf $FILE (" %3d",100.0*$meth_n/($meth_y+$meth_n));
#	    printf $FILE (" %3d",$cover);
#	}
#    }
#    printf $FILE ("\n");
#}
#close($FILE) || croak(sprintf("Cannot close open file \"%s\"",$file));
