#!/usr/bin/perl -w use strict; use Carp; =pod Thomas Keane, Fri May 02 12:12:20 BST 2008 @508 /Internet Time/ Pathogen Genomics, Sanger Institute, UK This is a script that takes SSAHA2 cigar format output of reads vs. a reference genome and creates Artemis output coverage plots from ONLY the uniquely aligned reads Code Licence: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . =cut #check arguments if( @ARGV != 2 ) { print "Usage: cigar2Coverage.pl ssaha_output_file reference_genome_fasta\n"; exit; } my $cigar_file = $ARGV[ 0 ]; my $fasta_file = $ARGV[ 1 ]; if( ! -f $cigar_file || -z $cigar_file ){print "Cannot find ssaha output file or file is empty - please check\n";exit;} if( ! -f $fasta_file || -z $fasta_file ){print "Cannot find reference fasta file or file is empty - please check\n";exit;} print "Reading reference fasta file....\n"; my $ref = createHash( $fasta_file ); my %referenceFasta = %{ $ref }; my %coverageArrays_forward; my %coverageArrays_reverse; foreach( keys( %referenceFasta ) ) { my $key = $_; #strip any spaces from the reference headers (ssaha will not include them in the cigar output) if( $_ =~ /\s+/ ) { my @s = split( /\s+/, $_ ); $referenceFasta{ $s[ 0 ] } = $referenceFasta{ $_ }; delete( $referenceFasta{ $_ } ); $key = $s[ 0 ]; } #create the empty coverage arrays my @temp_fwd; my @temp_rev; my $contigLength = length( ( (split( /\n/, $referenceFasta{ $key } ) )[ 1 ] ) ); for( my $i = 0; $i < $contigLength; $i ++ ) { push( @temp_fwd, 0 ); push( @temp_rev, 0 ); } $coverageArrays_forward{ $key } = \@temp_fwd; $coverageArrays_reverse{ $key } = \@temp_rev; } print "Hashing ssaha cigar output...\n"; if( $cigar_file =~ /\.gz$/ ) { open( SSAHA_OUTPUT, "gunzip -c $cigar_file |" ) or die "Cant open ssaha2 output file: $!\n"; } else { open( SSAHA_OUTPUT, $cigar_file ) or die "Cant open ssaha2 output file: $!\n"; } my $processed = 0; my %ssaha_read_matches; my $ssaha_current_read = ""; my $ssaha_current_read_ref = []; while( ) { chomp; if( $_ =~ /^cigar:/ ) { my $line = $_; my @ssaha_splits = split( /\s+/, $line ); shift( @ssaha_splits ); $line = join(' ',@ssaha_splits); if( $ssaha_current_read eq "" ) { $ssaha_current_read = $ssaha_splits[ 0 ]; $ssaha_current_read_ref = []; push( @{ $ssaha_current_read_ref }, $line ); } elsif( $ssaha_current_read ne $ssaha_splits[ 0 ] ) { #at a changover point between two different read matches if( @{ $ssaha_current_read_ref } == 1 ) { #read has a single hit so include it my @hits = @{ $ssaha_current_read_ref }; my @splits = split( /\s+/, $hits[ 0 ] ); my $start; my $finish; if( $splits[ 5 ] > $splits[ 6 ] ) { $start = $splits[ 6 ] - 1; $finish = $splits[ 5 ] - 1; } else { $start = $splits[ 5 ] - 1; $finish = $splits[ 6 ] - 1; } if( $splits[ 3 ] eq '+' ) { for( my $i = $start; $i <= $finish ; $i ++ ) { $coverageArrays_forward{ $splits[ 4 ] }[ $i ] ++; } } else { for( my $i = $start; $i <= $finish ; $i ++ ) { $coverageArrays_reverse{ $splits[ 4 ] }[ $i ] ++; } } } $processed ++; print "Processed $processed reads...\n" unless $processed % 10000 != 0; $ssaha_current_read_ref = undef; $ssaha_current_read_ref = []; $ssaha_current_read = $ssaha_splits[ 0 ]; push( @{ $ssaha_current_read_ref } , $line ); } else { push( @{ $ssaha_current_read_ref } , $line ); } } } #put the entries for the last read in the hash table if( @{ $ssaha_current_read_ref } == 1 ) { #read has a single hit so include it my @hits = @{ $ssaha_current_read_ref }; my @splits = split( /\s+/, $hits[ 0 ] ); my $start; my $finish; if( $splits[ 5 ] > $splits[ 6 ] ) { $start = $splits[ 6 ] - 1; $finish = $splits[ 5 ] - 1; } else { $start = $splits[ 5 ] - 1; $finish = $splits[ 6 ] - 1; } if( $splits[ 3 ] eq '+' ) { for( my $i = $start; $i <= $finish ; $i ++ ) { $coverageArrays_forward{ $splits[ 4 ] }[ $i ] ++; } } else { for( my $i = $start; $i <= $finish ; $i ++ ) { $coverageArrays_reverse{ $splits[ 4 ] }[ $i ] ++; } } } close( SSAHA_OUTPUT ); print "Creating plot files....\n"; my $count = 0; foreach( keys( %coverageArrays_forward ) ) { open( OUT, ">".$_.".plot" ) or die "Cannot create output file\n"; my @t = @{ $coverageArrays_forward{ $_ } }; my @t1 = @{ $coverageArrays_reverse{ $_ } }; for( my $i = 0; $i < @t; $i ++ ) { print OUT $t[ $i ].' '.$t1[ $i ]."\n"; } close( OUT ); $count ++; } #a subroutine to take ssaha cigar output and group the records according to reads sub hash_ssaha_cigar_output { if( @_ != 1 ) { print "Usage: hash_ssaha_cigar_output ssaha_output_file"; exit; } my $ssaha_output = shift; if( $ssaha_output =~ /\.gz$/ ) { open( SSAHA_OUTPUT, "gunzip -c $ssaha_output |" ) or die "Cant open ssaha2 output file: $!\n"; } else { open( SSAHA_OUTPUT, $ssaha_output ) or die "Cant open ssaha2 output file: $!\n"; } my %ssaha_read_matches; my $ssaha_current_read = ""; my $ssaha_current_read_ref = []; while( ) { chomp; if( $_ =~ /^cigar:/ ) { my $line = substr( $_, 7 ); my @ssaha_splits = split( /\s+/, $line ); if( $ssaha_current_read eq "" ) { $ssaha_current_read = $ssaha_splits[ 0 ]; $ssaha_current_read_ref = []; push( @{ $ssaha_current_read_ref }, $line ); #push( @ssaha_current_read_matches, $_ ); } elsif( $ssaha_current_read ne $ssaha_splits[ 0 ] ) { #at a changover point between two different read matches my $tmp_ref = \@{ $ssaha_current_read_ref }; $ssaha_read_matches{ $ssaha_current_read } = $tmp_ref; $ssaha_current_read_ref = undef; $ssaha_current_read_ref = []; $ssaha_current_read = $ssaha_splits[ 0 ]; push( @{ $ssaha_current_read_ref } , $line ); } else { push( @{ $ssaha_current_read_ref } , $line ); } } } #put the entries for the last read in the hash table $ssaha_read_matches{ $ssaha_current_read } = $ssaha_current_read_ref; close( SSAHA_OUTPUT ); return \%ssaha_read_matches; } #a subroutine to create a hash table from a fastq OR fasta file #keys are the read/contig names #entries are the sequences and quality values sub createHash { croak "Usage: createHash file_name" unless @_ == 1; my $file = shift; if( ! (-f $file) ){croak "Cannot find file: $file\n";} if( ! (-s $file) ){croak "Empty file: $file\n";} my %reads_file; if( $file =~ /\.gz$/ ) { open( READS, "gunzip -c $file |" ) or die "Cannot open gzipped fastq file\n"; } else { open( READS, $file ) or die "Failed to open reads file"; } my $read_name = ; #first readname in first line of file chomp( $read_name ); my $read = "$read_name\n"; $read_name = substr( $read_name, 1, length( $read_name ) - 1 ); #remove the @/> sign my $line; while( ) #read down file until hit start of next read { chomp; $line = $_; if( $line =~ /^[\@|>].*/ ) #if hit start of next read { if( defined $reads_file{ $read_name } ) { #check the sequence entry is same as one that is already defined if( $reads_file{ $read_name } eq $read ) { print "WARNING: Read entry with identical sequence already exists in hash: $read_name\n"; } else { print "WARNING: Read entry with different sequence already exists in hash: $read_name\n"; } } else { #print "$read\n" unless $read_name ne "big961b12.p1k"; $reads_file{ $read_name } = $read; #enter into the hash } $read_name = substr( $line, 1, length( $line ) - 1 ); #remove the @/> sign - next read name is in the line variable $read = "$line\n"; } else { if( $line =~ /^\+/ ) { $read = $read."\n".$line; #add to info for the current read } else { $read = $read.$line; #add to info for the current read } } } close( READS ); #enter final value into the hash $reads_file{ $read_name } = $read; #enter into the hash #my @keys = keys %reads_file; #my $size = @keys; #print "Fastq file has $size reads\n"; return \%reads_file; }