#!/usr/local/bin/perl -w

# extract_cpg_seq.pl
# Maureen Liu, Sanger Institute, May/2005
# Use Ensembl API to extract CpG island sequences for methylation assay
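# Input: list of coordinates (gene name, start, end), tab-delimited
# Output: FASTA file (sequence.txt) with the sequence of each island
#         (the 100 bp up- and down-stream padding is currently switched
#         off in the extraction loop)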
# Check/update database info in first section before each use

# locate Bioperl and Ensembl API
use lib '/nfs/disk100/pubseq/PerlModules/Ensembl/www_38_1/bioperl-live';
use lib '/nfs/disk100/pubseq/PerlModules/Ensembl/www_38_1/ensembl/modules';

use Bio::Seq;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::DBAdaptor;

# input and output files===============================================

# Decide database to use.
# Each menu number maps to the database name handed to the DBAdaptor
# (dbname -> $current_db) and to the short label that is written into
# every FASTA header (label -> $database).
my %db_choice_for = (
    1 => { dbname => "vega_homo_sapiens_ext_20051013_v38_NCBI36",
           label  => "hs:VEGA" },
    2 => { dbname => "mus_musculus_core_38_35",
           label  => "mm:NCBI35" },
    3 => { dbname => "pan_troglodytes_core_40_3c",
           label  => "pan:v1.0" },
);

print "Please select database:
1. Vega human
2. Ensembl mouse
3. Ensembl chimp\n";
my $selection = <STDIN>;

# EOF on STDIN gives undef; treat it like an empty (invalid) answer
# instead of letting it trigger "uninitialized value" warnings.
$selection = '' unless defined $selection;

# Remove the newline and any stray blanks so that $selection ends up as
# exactly "1", "2" or "3" for the numeric comparisons made further down.
$selection =~ s/\A\s+//;
$selection =~ s/\s+\z//;

# A hash lookup only accepts the exact menu numbers; a numeric "=="
# test would also let through input such as "1abc".
exists $db_choice_for{$selection}
    or die "Please select a valid database number.\n";

my $current_db = $db_choice_for{$selection}{dbname};
my $database   = $db_choice_for{$selection}{label};

# Get the coordinates file
print "What's the coordinates file: ";
my $coord = <STDIN>;
defined $coord or die "No coordinates file name given.\n";
chomp $coord;

# Two-argument open() ignored blanks around the name; keep accepting
# such input now that the name is passed through literally.
$coord =~ s/\A\s+//;
$coord =~ s/\s+\z//;

# Three-argument open with an explicit read mode: the name typed by the
# user is never interpreted as a mode or a pipe (e.g. ">file", "cmd |").
open(my $coord_fh, '<', $coord)
    or die "Can't open coordinates file: $coord ($!)\n";

# extract coordinates from coordinates file: one raw input line per
# element, line endings still attached
my @coord = <$coord_fh>;
close $coord_fh;

# Output sequence file (FASTA, overwritten on every run)
my $seqIO = Bio::SeqIO->new(-file   => ">sequence.txt",
                            -format => "fasta");

# Connect to the relevant database using DBAdaptor.
# The database name comes from the menu choice; the remaining
# connection parameters depend on which server hosts that database.
my %connect_args = (-dbname => "$current_db");

if ($selection == 1) {
    # vega uses different parameters (own host, port and read-only user)
    $connect_args{-user} = "ensro";
    $connect_args{-host} = "ecs3f";
    $connect_args{-port} = "3310";
}
else {
    # Ensembl databases (mouse, chimp); no port is passed for these
    $connect_args{-user} = "anonymous";
    $connect_args{-host} = "ensembldb.ensembl.org";
}

# Direct method call instead of the ambiguous indirect-object form
# "new Class (...)".
my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%connect_args);

# get the slice adaptor
my $slice_adaptor = $db->get_SliceAdaptor();

# fetch chromosome x
# fetch_by_region can hand back undef when the requested region is not
# present in the selected database; stop here with a clear message
# instead of failing later on the first subseq() call.
my $slice = $slice_adaptor->fetch_by_region('chromosome', 'x')
    or die "Can't fetch chromosome x from database $current_db\n";

# obtain the sequence for each set of coordinates

# Number of extra bases taken on each side of every island. It is 0
# because the padding is currently switched off; set it to 100 to take
# 100 bp up&downstream (for cpg island extraction only).
my $flank = 0;

LINE:
foreach my $line (@coord)
{
    # Work on a copy so the elements of @coord are left untouched.
    my $record = $line;

    # Drop the line ending (Unix or DOS) and ignore empty lines, e.g. a
    # trailing blank line at the end of the coordinates file.
    $record =~ s/[\r\n]+\z//;
    next LINE if $record !~ /\S/;

    my ($id, $start, $end) = split /\t/, $record; # 0=id, 1=start, 2=end

    # Tolerate blanks around the two numeric columns.
    foreach my $field ($start, $end) {
        next unless defined $field;
        $field =~ s/\A\s+//;
        $field =~ s/\s+\z//;
    }

    # Header lines or lines with missing/non-numeric coordinates cannot
    # be turned into a sequence; report them and carry on with the rest.
    if (!defined $end || $start !~ /\A\d+\z/ || $end !~ /\A\d+\z/) {
        warn "Skipping malformed coordinates line: $record\n";
        next LINE;
    }

    # Apply the optional padding; the FASTA header reports the
    # coordinates that were actually extracted.
    $start -= $flank;
    $end   += $flank;

    # extract the sequence from the chromosome slice
    my $seq = Bio::Seq->new(
        -seq        => $slice->subseq($start, $end),
        -display_id => "$id:$database:X:$start:$end",
        -alphabet   => "dna",
    );

    # output to sequence file
    $seqIO->write_seq($seq);
}

exit;
