#!/usr/bin/perl -w use strict; ########################################################################################## ########################################################################################## my @files; my $fasta; my $bowtieout; my $ebwt; my $target; my $parsebowtie; my $single; my $left; my $right; my $fileout; my $files1; my @files1; my $mode; my $file1; my $file2; my $fold = 10; my $min_cov = 2; my $trim5 = 0; my $trim3 = 0; my $processors = 1; ########################################################################################## # command line requires base name of the transcriptome files e.g. \*.fasta # # NB Script requires unity between transcriptome and raw data file names # FOR paired end:::: NAME.fasta, NAME.L.fastq NAME.R.fastq # FOR unpaired :::: NAME.fasta, NAME.fastq ########################################################################################## ########################################################################################## # get suffix of files and mode (paired or unpaired) ########################################################################################## for (my $i=0; $i<=$#ARGV; ++$i) { if($ARGV[$i] eq "-in") { # -in = input files i.e. output of bwt if(($i+1)>($#ARGV)) { exit_error1(); } $files1 = ($ARGV[$i+1]); $i++; } if($ARGV[$i] eq "-mode") { # -mode = paired or unpaired data if(($i+1)>($#ARGV)) { exit_error1(); } $mode = ($ARGV[$i+1]); $i++; } if($ARGV[$i] eq "-fold") { # -fold is how many fold enrichment of real over others is required to keep if(($i+1)>($#ARGV)) { exit_error1(); } $fold = ($ARGV[$i+1]); $i++; } if($ARGV[$i] eq "-min_cov") { # -fold is how many fold enrichment of real over others is required to keep if(($i+1)>($#ARGV)) { exit_error1(); } $min_cov = ($ARGV[$i+1]); $i++; } if($ARGV[$i] eq "-trim5") { # -fold is how many fold enrichment of real over others is required to keep if(($i+1)>($#ARGV)) { exit_error1(); } $trim5 = ($ARGV[$i+1]); $i++; } if($ARGV[$i] eq "-trim3") { # -fold is how many fold enrichment of real over others is required to keep if(($i+1)>($#ARGV)) { exit_error1(); } $trim3 = ($ARGV[$i+1]); $i++; } if($ARGV[$i] eq "-proc") { # -fold is how many fold enrichment of real over others is required to keep if(($i+1)>($#ARGV)) { exit_error1(); } $processors = ($ARGV[$i+1]); $i++; } } @files1 = glob $files1; unless (scalar @files1 > 0 ) { print "no data files present. Check spelling!\n\n\n\n"; &exit_error; } unless ($mode eq 'u' or $mode eq 'p') { print "-mode not set (specify -mode p (paired) or -mode u (unpaied)\n\n\n\n"; &exit_error; } ########################################################################################## # base files into new array ########################################################################################## print "mode = $mode\n"; foreach (@files1) { $_ =~ s/.fasta//; push @files, $_; } ########################################################################################## # Run bowtie build on transcript files if needed. ########################################################################################## foreach $file1 (@files){ $fasta = $file1 . ".fasta"; $bowtieout = $file1 . ".bowtie"; $ebwt = $bowtieout . ".1.ebwt"; unless (-e $ebwt) { #system "/Users/maxtelford/othersbin/bowtie-1.1.1/bowtie-build --offrate 3 $fasta $bowtieout"; system "bowtie-build --offrate 3 $fasta $bowtieout"; } else { print "skipping build stage for $fasta\n"; } } ########################################################################################## # Set up required filenames and for each transcriptome file, run each of the raw datasets # against it to count how many raw reads from all the spp hit each hit each seq ########################################################################################## foreach $file1 (@files){ print "\n\n\n"; $fasta = $file1 . ".fasta"; print "fasta = $fasta\n"; $target = $file1 . "vs" . $file1 . '.out'; print "target = $target\n"; $parsebowtie = "\\\*vs$file1.out"; print "parsebowtie = $parsebowtie\n"; $bowtieout = $file1 . ".bowtie"; foreach $file2 (@files){ $single = $file2 . ".fastq"; print "single = $single\n"; $left = $file2 . ".L.fastq"; print "left = $left\n"; $right = $file2 . ".R.fastq"; print "right = $right\n"; $fileout = $file2 . 'vs' . $file1 . '.out'; print "fileout = $fileout\n"; ########################################################################################## # Run mode paired (p) or unpaired (u) run bowtie differently. ########################################################################################## unless (-e $fileout) { print "Running bowtie $file2 versus $file1\n"; if ($mode eq 'u') { #system "/Users/maxtelford/othersbin/bowtie-1.1.1/bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout $single > $fileout\n\n"; system "bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout $single > $fileout\n\n"; } elsif ($mode eq 'p') { #system "/Users/maxtelford/othersbin/bowtie-1.1.1/bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout -1 $left -2 $right > $fileout\n\n"; system "bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout -1 $left -2 $right > $fileout\n\n"; } else { print "please indicate paired end \"p\" or unpaired \"u\" data\n"; exit; } ########################################################################################## # parse output file and summarise to reduce file size to "seq_name \t count" ########################################################################################## print "reducing output\n"; my %seqshash=(); my $seq; open (FILE, $fileout) || die; # while () { chomp; if ($_ =~ m/\w/i) { if (exists $seqshash{$_}) { $seqshash{$_}++; } else { $seqshash{$_} = 1; } } } close FILE; ########################################################################################## # overwrite original output file ########################################################################################## open (NEWFILE, ">$fileout"); foreach $seq (keys %seqshash) { print NEWFILE $seq, "\t", $seqshash{$seq}, "\n"; } close NEWFILE; %seqshash = (); } } ########################################################################################## # Run 'parsebowtie.pl which selects those sppX assembled seqs for which sppX raw data hit # more often than sppY or sppZ etc. ########################################################################################## system "parsebowtie.pl -in $parsebowtie -transcripts $fasta -target $target -fold $fold -min_cov $min_cov"; } ########################################################################################## #Exit error ########################################################################################## sub exit_error { die"\n *************************************** \tBT2.pl *************************************** Prog to run bowtie (http://bowtie-bio.sourceforge.net/index.shtml) on several assembled transcriptomes deriving from single illumina track with potential cross contamination. Runs bowtie-build for each set of assembled transcripts. Runs bowtie for each set of raw illumina data against each assembled transcriptome. Runs separate script 'parsebowtie.pl' to identify those assembled seqs with highest hit to correct raw data files (by factor of 100 more than nearest wrong raw data seq) Saves good hits to 'clean' file and rejects those that do not pass this threshold (these end up in 'bad' file). Command line requires base name of the transcriptome files e.g. \*.fasta and indication of paired or unpaired raw data Usage = bt2.pl -in -mode

-fold -min_cov e.g. bt2.pl -in \\*.fasta -mode p -fold 10 -min_cov 5 NOTE \\*.fasta NOT \*.fasta!! NB Script requires unity between transcriptome and raw data file names: FOR paired end : NAME.fasta (assembled transcriptome seqs) NAME.L.fastq (raw illumina data LEFT) NAME.R.fastq (raw illumina data RIGHT) FOR unpaired : NAME.fasta (assembled transcriptome seqs) NAME.fastq (raw illumina data) To trim 5' or 3' nucleotides from the fastq reads (e.g. low quality) use -trim5 and -trim3 -trim5 20 (trims first 20) -trim3 50 (trims last 50) to use more than one processor use -proc e.g. -proc 10 "; }