#!/usr/local/bin/perl -w

# Maureen Liu, Sanger Institute, Oct/2008
# CGI_compare_2.pl
#
# Aim: extract desired CpG island sequences from a list 
# 
# Input:
# 1. tab delimited file of desired island names
# 2. lists of island sequences in human, mouse and opossum
#
# Output: fasta sequences of homologous islands, mouse and opossum islands have the name 
#   of the orthologous human gene attached for ease of later alignment analysis
#

use strict;
use Bio::Seq;
use Bio::SeqIO;

# Input CGI name file: one group of homologous islands per line, tab delimited
my $in = 'CGI_homol_names.txt';

# extract CGI names
# A three-argument open with a lexical handle keeps the file name from ever
# being read as an open mode and avoids a global bareword handle; $! tells
# the user why the open failed.
open (my $names_fh, '<', $in) or die "Can't open coordinates file: $in: $!\n";
my @names = <$names_fh>;
close $names_fh;

# Input sequence files (fasta), one reader per species
my $human_in = Bio::SeqIO->new(-file => "CGI_seq_human_081018.seq", 
                             -format => "fasta" );
my $mouse_in = Bio::SeqIO->new(-file => "CGI_seq_mouse_081022.seq", 
                             -format => "fasta" );
my $opossum_in = Bio::SeqIO->new(-file => "CGI_seq_opossum_081018.seq", 
                             -format => "fasta" );

# Output cpg sequence file (fasta); the leading ">" opens it for writing
my $seqout = Bio::SeqIO->new(-file => ">CGI_homol.seq", 
                             -format => "fasta" );

# load input sequences into hashes
#
# Each hash maps an island name (the fasta display id with its first ":CGI"
# removed) to the raw sequence string.  When a name is already present, the
# new record is stored under "<name>_2"; any further record with that same
# name is stored under "<name>_2" again, replacing the earlier one.

my $seq;
my (%human,%mouse,%opossum) = ();

# Pair every reader with the hash it fills; files are read in this order.
for my $source ( [ $human_in,   \%human   ],
                 [ $mouse_in,   \%mouse   ],
                 [ $opossum_in, \%opossum ] ) {
  my ($reader, $store) = @$source;

  while ( $seq = $reader->next_seq() ) {
    ( my $key = $seq->display_id() ) =~ s/:CGI//;
    $key .= '_2' if exists $store->{$key};
    $store->{$key} = $seq->seq();
  }
}


# fetch desired sequences
#
# Every line of the name list is one group of homologous islands, tab
# delimited, with the columns in the order: opossum, human, mouse.  For each
# line the human, mouse and opossum records are written in that order, each
# followed by its "<name>_2" duplicate when one was loaded.  Blank lines are
# skipped; a missing name or a name without a loaded sequence is reported
# with a warning and not written to the output.

# [ column in the name list, species label, name => sequence lookup ],
# listed in the order the records are written for each line
my @species_tables = (
  [ 1, 'human',   \%human   ],
  [ 2, 'mouse',   \%mouse   ],
  [ 0, 'opossum', \%opossum ],
);

# Iterate over the list instead of "while (shift)": a shifted value that is
# false (e.g. a final line "0" without a newline) would end that loop early.
for my $line (@names) {

  # get CGI names (work on a copy so @names itself is left untouched)
  chomp( my $entry = $line );
  next if $entry !~ /\S/;    # blank line, nothing to fetch
  my @species = split (/\t/, $entry);

  for my $table (@species_tables) {
    my ($column, $label, $seq_for) = @$table;
    my $cgi = $species[$column];

    # short line: this species has no name in the list
    if ( !defined $cgi or $cgi eq '' ) {
      warn "No $label island name in line: $entry\n";
      next;
    }

    # write the sequence, or report the name if no sequence was loaded for it
    if ( exists $seq_for->{$cgi} ) {
      writeseq($cgi, $seq_for->{$cgi}, $label);
    }
    else {
      warn "No $label sequence found for island: $cgi\n";
    }

    # write the duplicate stored under "<name>_2", if there is one
    my $test = $cgi . '_2';
    if ( exists $seq_for->{$test} ) { writeseq($test, $seq_for->{$test}, $label); }
  }

}

sub writeseq {
  # Write one island to the output sequence file.
  #   $id       - island name; the record is labelled "<id>:CGI:<species>"
  #   $sequence - sequence string for the record
  #   $species  - species label appended to the record name
  # Returns whatever $seqout->write_seq returns.
  my ($id,$sequence,$species) = @_;

  my $record = Bio::Seq->new (
    -seq        => $sequence,
    -display_id => "${id}:CGI:${species}",
    -alphabet   => "dna",
  );

  return $seqout->write_seq($record);
}

