Commit db585d44 authored by psimion's avatar psimion
Browse files

readme minor updates 2

parent 8e1edbcb
......@@ -113,10 +113,10 @@ cd CroCo_directory/utils
bash ./install_dependencies.sh --tool all --os ubuntu
```
Here to only install the mapper Kallisto on a MAC OS X:
Here to only install RapMap on a MAC OS X:
```bash
bash ./install_dependencies.sh --tool K --os macosx
bash ./install_dependencies.sh --tool R --os macosx
```
We recommend to install all these dependencies automatically even if some of these softwares are already installed on your computer. Indeed, this will assure you that you are using versions of these softwares that are compatible with the current version of CroCo. Do not worry, they will be installed only within CroCo's directory and will not affect in any way the use of the softwares already installed on your system.
......@@ -167,7 +167,7 @@ This script works under Ubuntu, Debian, Fedora, RedHat, CentOS and on Mac OS X.
Script usage :
```bash
./install_dependencies.sh --tool all|B|K|R --os ubuntu|debian|fedora|centos|redhat|macosx
./install_dependencies.sh --tool all|B|K|R|BL --os ubuntu|debian|fedora|centos|redhat|macosx
```
If you encounter problems during dependencies installation, take a look at the [Troubleshooting section](#troubleshooting) and at the `*_install.log` files created in the `utils/bin/` directory.
......
No preview for this file type
#!/bin/bash
SCRIPT_PATH=$(readlink -f "$0")
export PATH=$PATH:`dirname $SCRIPT_PATH`
# echo something in stderr
function errcho(){
>&2 echo $1
}
# check if a binary can be found in the PATH
# if not, warn and die
function checkBinPresence(){
if [[ "`which $1`" == "" ]]; then
errcho "$1 must be installed and in your PATH"
exit 1
fi
}
# checking bowtie presence
checkBinPresence bowtie
checkBinPresence bowtie-build
# now we launch the script and pass it all the arguments we received
crossconta-detection.pl $@
#!/usr/bin/perl -w
use strict;
##########################################################################################
##########################################################################################
my @files;
my $fasta;
my $bowtieout;
my $ebwt;
my $target;
my $parsebowtie;
my $single;
my $left;
my $right;
my $fileout;
my $files1;
my @files1;
my $mode;
my $file1;
my $file2;
my $fold = 10;
my $min_cov = 2;
my $trim5 = 0;
my $trim3 = 0;
my $processors = 1;
##########################################################################################
# command line requires base name of the transcriptome files e.g. \*.fasta
#
# NB Script requires unity between transcriptome and raw data file names
# FOR paired end:::: NAME.fasta, NAME.L.fastq NAME.R.fastq
# FOR unpaired :::: NAME.fasta, NAME.fastq
##########################################################################################
##########################################################################################
# get suffix of files and mode (paired or unpaired)
##########################################################################################
for (my $i=0; $i<=$#ARGV; ++$i) {
if($ARGV[$i] eq "-in") { # -in = input files i.e. output of bwt
if(($i+1)>($#ARGV)) { exit_error1(); }
$files1 = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-mode") { # -mode = paired or unpaired data
if(($i+1)>($#ARGV)) { exit_error1(); }
$mode = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-fold") { # -fold is how many fold enrichment of real over others is required to keep
if(($i+1)>($#ARGV)) { exit_error1(); }
$fold = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-min_cov") { # -fold is how many fold enrichment of real
over others is required to keep if(($i+1)>($#ARGV)) { exit_error1(); }
$min_cov = ($ARGV[$i+1]); $i++;
}
if($ARGV[$i] eq "-trim5") { # -fold is how many fold enrichment of real over others is required to keep
if(($i+1)>($#ARGV)) { exit_error1(); }
$trim5 = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-trim3") { # -fold is how many fold enrichment of real over others is required to keep
if(($i+1)>($#ARGV)) { exit_error1(); }
$trim3 = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-proc") { # -fold is how many fold enrichment of real over others is required to keep
if(($i+1)>($#ARGV)) { exit_error1(); }
$processors = ($ARGV[$i+1]);
$i++;
}
}
@files1 = glob $files1;
unless (scalar @files1 > 0 ) {
print "no data files present. Check spelling!\n\n\n\n";
&exit_error;
}
unless ($mode eq 'u' or $mode eq 'p') {
print "-mode not set (specify -mode p (paired) or -mode u (unpaied)\n\n\n\n";
&exit_error;
}
##########################################################################################
# base files into new array
##########################################################################################
print "mode = $mode\n";
foreach (@files1) {
$_ =~ s/.fasta//;
push @files, $_;
}
##########################################################################################
# Run bowtie build on transcript files if needed.
##########################################################################################
foreach $file1 (@files){
$fasta = $file1 . ".fasta";
$bowtieout = $file1 . ".bowtie";
$ebwt = $bowtieout . ".1.ebwt";
unless (-e $ebwt) {
#system "/Users/maxtelford/othersbin/bowtie-1.1.1/bowtie-build --offrate 3 $fasta $bowtieout";
system "bowtie-build --offrate 3 $fasta $bowtieout";
}
else {
print "skipping build stage for $fasta\n";
}
}
##########################################################################################
# Set up required filenames and for each transcriptome file, run each of the raw datasets
# against it to count how many raw reads from all the spp hit each hit each seq
##########################################################################################
foreach $file1 (@files){
print "\n\n\n";
$fasta = $file1 . ".fasta"; print "fasta = $fasta\n";
$target = $file1 . "vs" . $file1 . '.out'; print "target = $target\n";
$parsebowtie = "\\\*vs$file1.out"; print "parsebowtie = $parsebowtie\n";
$bowtieout = $file1 . ".bowtie";
foreach $file2 (@files){
$single = $file2 . ".fastq";
print "single = $single\n";
$left = $file2 . ".L.fastq";
print "left = $left\n";
$right = $file2 . ".R.fastq";
print "right = $right\n";
$fileout = $file2 . 'vs' . $file1 . '.out';
print "fileout = $fileout\n";
##########################################################################################
# Run mode paired (p) or unpaired (u) run bowtie differently.
##########################################################################################
unless (-e $fileout) {
print "Running bowtie $file2 versus $file1\n";
if ($mode eq 'u') {
#system "/Users/maxtelford/othersbin/bowtie-1.1.1/bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout $single > $fileout\n\n";
system "bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout $single > $fileout\n\n";
}
elsif ($mode eq 'p') {
#system "/Users/maxtelford/othersbin/bowtie-1.1.1/bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout -1 $left -2 $right > $fileout\n\n";
system "bowtie -p $processors -a --trim5 $trim5 --trim3 $trim3 --suppress 1,2,4,5,6,7,8 $bowtieout -1 $left -2 $right > $fileout\n\n";
}
else {
print "please indicate paired end \"p\" or unpaired \"u\" data\n";
exit;
}
##########################################################################################
# parse output file and summarise to reduce file size to "seq_name \t count"
##########################################################################################
print "reducing output\n";
my %seqshash=();
my $seq;
open (FILE, $fileout) || die; #
while (<FILE>) {
chomp;
if ($_ =~ m/\w/i) {
if (exists $seqshash{$_}) {
$seqshash{$_}++;
}
else {
$seqshash{$_} = 1;
}
}
}
close FILE;
##########################################################################################
# overwrite original output file
##########################################################################################
open (NEWFILE, ">$fileout");
foreach $seq (keys %seqshash) {
print NEWFILE $seq, "\t", $seqshash{$seq}, "\n";
}
close NEWFILE;
%seqshash = ();
}
}
##########################################################################################
# Run 'parsebowtie.pl which selects those sppX assembled seqs for which sppX raw data hit
# more often than sppY or sppZ etc.
##########################################################################################
system "parsebowtie.pl -in $parsebowtie -transcripts $fasta -target $target -fold $fold -min_cov $min_cov";
}
##########################################################################################
#Exit error
##########################################################################################
sub exit_error {
die"\n
***************************************
\tBT2.pl
***************************************
Prog to run bowtie (http://bowtie-bio.sourceforge.net/index.shtml) on several assembled transcriptomes deriving from single illumina track
with potential cross contamination.
Runs bowtie-build for each set of assembled transcripts.
Runs bowtie for each set of raw illumina data against each assembled transcriptome.
Runs separate script 'parsebowtie.pl' to identify those assembled seqs with highest hit to correct raw data files (by factor of 100
more than nearest wrong raw data seq)
Saves good hits to 'clean' file and rejects those that do not pass this threshold (these end up in 'bad' file).
Command line requires base name of the transcriptome files e.g. \*.fasta and indication of paired or unpaired raw data
Usage = bt2.pl -in <base_file> -mode <p or u for paired or unpaired> -fold <fold enrichment> -min_cov <minimum coverage to keep>
e.g. bt2.pl -in \\*.fasta -mode p -fold 10 -min_cov 5
NOTE \\*.fasta NOT \*.fasta!!
NB Script requires unity between transcriptome and raw data file names:
FOR paired end :
NAME.fasta (assembled transcriptome seqs)
NAME.L.fastq (raw illumina data LEFT)
NAME.R.fastq (raw illumina data RIGHT)
FOR unpaired :
NAME.fasta (assembled transcriptome seqs)
NAME.fastq (raw illumina data)
To trim 5' or 3' nucleotides from the fastq reads (e.g. low quality) use -trim5 and -trim3
-trim5 20 (trims first 20)
-trim3 50 (trims last 50)
to use more than one processor use -proc
e.g. -proc 10
";
}
#!/usr/bin/perl -w
use strict;
my $files;
my $numberoutput;
my %hash;
my $key;
my $file;
my %seqshash;
my @sorted;
my $seq;
my $target;
my $fold = 5;
my @ok_transcripts;
my $transcripts;
my $firstline;
my $aa_name;
my @aa_seq;
my $aa_seq;
my %aa_nameseq;
my @aa_spnames;
my $transcript;
my @bad_transcripts;
my $min_cov = 5;
my %totaliser1;
my %totaliser2;
my %totaliser3;
my @totalised;
my @equal_sized_hits;
my @UNequal_sized_hits;
my %NEWhash;
##########################################################################################
#get commands.
##########################################################################################
for (my $i=0; $i<=$#ARGV; ++$i) {
if($ARGV[$i] eq "-in") { # -in = input files i.e. output of bwt
if(($i+1)>($#ARGV)) { exit_error1(); }
$files = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-target") { # -target = bowtie output file MEANT to have the correct transcripts
if(($i+1)>($#ARGV)) { exit_error1(); }
$target = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-fold") { # -fold = fold enrichment required to be correct default = 10 fold
if(($i+1)>($#ARGV)) { exit_error1(); }
$fold = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-transcripts") { # -transcripts file to be thinned.
if(($i+1)>($#ARGV)) { exit_error1(); }
$transcripts = ($ARGV[$i+1]);
$i++;
}
if($ARGV[$i] eq "-min_cov") { # -transcripts file to be thinned.
if(($i+1)>($#ARGV)) { exit_error1(); }
$min_cov = ($ARGV[$i+1]);
$i++;
}
}
##########################################################################################
#Check existence of various files before going any further.
##########################################################################################
my @query_files = glob $files;
unless (scalar @query_files > 0 ) {
print "no data files present. Check spelling!\n";
&exit_error;
}
unless (-e $transcripts) {
print "$transcripts does not exist in this directory\n";
&exit_error;
}
my $outputfile = $transcripts . ".clean";
my $bad_outputfile = $transcripts . ".bad";
unless (-e $target) {
print "$target does not exist in this directory. Check spelling???\n";
&exit_error;
}
##########################################################################################
#Read in bowtie output files to hash recording number of times each seq hit for each readfile
##########################################################################################
foreach $file (@query_files) {
print STDERR "reading file $file\n";
open (FILE, $file) || die; #
while (<FILE>) {
chomp;
if ($_ =~ m/\w/i) {
(my $name, my $value) = split '\t';
chomp $value;
chomp $name;
unless (exists $seqshash{$name}) {
$seqshash{$name} = $value;
}
else {
$seqshash{$name} = $seqshash{$name} + $value; #will give TOTAL hits to a seq across all 'omes
}
$hash{$file}{$name} = $value; #will give hits to a seq per spp
}
else {
next;
}
}
close FILE;
}
print STDERR "Filling in the blanks\n"; # cannot do comparison if value undefined
foreach $seq (keys %seqshash) {
foreach $file (@query_files) {
unless (exists $hash{$file}{$seq}) {
$hash{$file}{$seq} = 0;
}
}
}
##########################################################################################
#Sorting seqs first by number of total hits then between different fastq files
##########################################################################################
print STDERR "Sorting\n";
my @keys = sort { $seqshash{$a} <=> $seqshash{$b} } keys(%seqshash);
open (BAD, ">$bad_outputfile");
foreach $key (@keys) { # i.e. for each named sequence in turn
@sorted = ();
my @sorted = sort { $hash{$a}{$key} <=> $hash{$b}{$key} } keys(%hash); #sort the file according to number of hits and put 'file' into array in this order
if ($sorted[-1] eq $target and $hash{$sorted[-1]}{$key} > ($hash{$sorted[-2]}{$key}*$fold) and $hash{$sorted[-1]}{$key} > $min_cov) { #if 'correct' file has highest number of hits then keep it
push @ok_transcripts, $key;
# print "\n$key"; foreach (@sorted) {print " $_ $hash{$_}{$key} "};
}
else {
##########################################################################################
#Totaliser: adding up number of bad hits and how many TOTAL hits versus good and bad spp. place in three different totaliser hashes
##########################################################################################
unless ($sorted[-1] eq $target) {
if (exists $totaliser1{$sorted[-1]}) {
$totaliser1{$sorted[-1]}++; #number of affected seqs
$totaliser2{$sorted[-1]} = $totaliser2{$sorted[-1]} + $hash{$sorted[-1]}{$key}; #number of hits in WRONG seq (i.e. the spp it should NOT be from)
$totaliser3{$sorted[-1]} = $totaliser3{$sorted[-1]} + $hash{$target}{$key}; #number of hits in target (i.e. sp it OUGHT to be from)
}
else { #initiate hash entry if doesnt yet exist
$totaliser1{$sorted[-1]} = 1;
$totaliser2{$sorted[-1]} = $hash{$sorted[-1]}{$key};
$totaliser3{$sorted[-1]} = $hash{$target}{$key};
push @totalised, $sorted[-1]; #put these entries into array so can recall them later
}
##########################################################################################
#find subset of seqs with similar high number of hits in bad and good. Not expected of rare contam. May be general contam from outside source
##########################################################################################
if ($hash{$sorted[-1]}{$key}/($hash{$target}{$key}+1) <100 or $hash{$sorted[-1]}{$key} < 100) {
push @equal_sized_hits, $key;
}
else {
push @UNequal_sized_hits, $key;
if (exists $NEWhash{$sorted[-1]}){
$NEWhash{$sorted[-1]}++;
}
else {
$NEWhash{$sorted[-1]}=1;
}
}
}
##########################################################################################
#print out identity of bad seq, hits in bad spp and hits in good spp (target)
##########################################################################################
# unless ($sorted[-1] eq $target) {
# print "$sorted[-1] $hash{$sorted[-1]}{$key}\t";
# print "$hash{$target}{$key}\n";
# }
##########################################################################################
#print out seq name and for each spp how many hits there were send to BAD file
##########################################################################################
print BAD "\n$key";
foreach (@sorted) {
print BAD " $_ $hash{$_}{$key} ";
}
push @bad_transcripts, $key;
}
}
foreach $key (keys %NEWhash) {
print "nn $key $NEWhash{$key}\n";
}
##########################################################################################
#print out result of the Totaliser bit (see above)
##########################################################################################
# foreach (@totalised) {
# print "$_ $totaliser1{$_} \t $totaliser2{$_} \t $totaliser3{$_}\n";
# }
chomp $transcripts;
print STDERR "Loading transcripts file $transcripts\n";
open (FILE2, $transcripts);
$/ = '>'; # alter record separator for fasta/nbrf file
while (<FILE2>) {
chomp;
if ($_ =~ m/\S/i) {
if ($_ =~ m/\w\w;/ and $_ =~ /\*/i) { # This means it is an nbrf/pir file
($firstline, $aa_name, @aa_seq) = split "\n"; # name into $name, several seq lines into @Sequence
}
else { # its a FASTA file
($aa_name, @aa_seq) = split "\n"; # name into $name, several seq lines into @Sequence
}
$aa_name =~ s/\s.+$//g;
$aa_seq = join ('',@aa_seq); #now join up elements in @n_seq to create $n_seq.
$aa_seq =~ s/\s//g;
if ($aa_seq =~ /[A-Za-z]/) {
$aa_nameseq{$aa_name} = $aa_seq;
push @aa_spnames, $aa_name;
}
}
}
print STDERR "saving good transcripts to $outputfile\n";
open (OUTPUT, ">>$outputfile");
foreach my $ok (@ok_transcripts) {
print OUTPUT ">$ok\n$aa_nameseq{$ok}\n";
}
close OUTPUT;
print STDERR "saving bad transcripts to $bad_outputfile\n";
foreach my $bad (@bad_transcripts) {
print BAD ">$bad\n$aa_nameseq{$bad}\n";
}
close BAD;
# print "\nEQUAL\n";
# foreach my $equal (@equal_sized_hits) {
# print ">$equal\n$aa_nameseq{$equal}\n";
# }
#
# print "\nUNEQUAL\n";
# foreach my $UNequal (@UNequal_sized_hits) {
# print ">$UNequal\n$aa_nameseq{$UNequal}\n";
# }
#
# & scatter_plot;
#
# use lib '/Users/maxtelford/othersbin/ScatterPlot';
# use ScatterPlot;
#
# sub scatter_plot {
#
# my $plot = ScatterPlot->new(); # make a new ASCII_Plot object
#
# # define some example dataset
# my $x_size = 40;
# my $y_size = 15;
# my @xy_points = ([-0.5,0.5],[1,1.5],[-1.5,-1],[-2,-2],[2,2]);
#
# # draw the plot
# print "\n";
# print "\n";
# print "HTML call:\n";
# $plot->draw(\@xy_points, $x_size, $y_size, 'x_axis', 'y_axis', 'o', 'html');
# print "\n";
# print "\n";
# print "text call\n";
# $plot->draw(\@xy_points, $x_size, $y_size, 'x_axis', 'y_axis', 'o', 'text');
# print "\n";
# print "\n";
# print "Result with no input parameters:\n";
# $plot->draw();
#
# }
sub exit_error {