Commit eba8bb98 authored by peguerin's avatar peguerin
Browse files

raw master

parents
## Global settings
EDNA_PATH=/media/superdisk/edna
DATA_PATH="$EDNA_PATH"/donnees/rhone_all

# Print the sample prefix of an R1 fastq path: the file name up to its first
# ".", with every "_R1" removed.
# $1: path of an R1 fastq file
fastq_prefix() {
  local name=${1##*/}
  name=${name%%.*}
  printf '%s\n' "${name//_R1/}"
}

## List the prefixes of the R1 fastq files
# Globbing (instead of parsing `ls`) keeps paths containing whitespace intact.
for fastq in "$DATA_PATH"/*_R1.fastq.gz; do
  [ -e "$fastq" ] || continue # unmatched glob: there is no fastq file
  fastq_prefix "$fastq"
done > liste_fq
## List the dat files
for dat in "$DATA_PATH"/*dat; do
  [ -e "$dat" ] || continue # unmatched glob: there is no dat file
  printf '%s\n' "$dat"
done > liste_dat
## Pair each fastq prefix with a dat file, line by line (tab-separated)
paste liste_fq liste_dat > liste_fq_dat
rm liste_fq liste_dat
## Write the shell script holding one pipeline command per fastq/dat pair
# %q shell-quotes both arguments so the generated commands stay valid even when
# a prefix or a path contains whitespace or other special characters.
while IFS=$'\t' read -r prefix dat_file; do
  printf 'bash pipeline_single.sh %q %q\n' "$prefix" "$dat_file"
done < liste_fq_dat > fq_dat_cmd.sh
## Global settings
# Root of the eDNA project tree.
EDNA_PATH="/media/superdisk/edna"
# Suffix of the reference database files (embl_<pref_bdr>, db_<pref_bdr>.fasta).
pref_bdr=std
# $1: prefix of the fastq files (e.g. 161124_SND393_A_L005_GWM-849); the same
# value is used as the prefix of the files produced by the run.
pref_fastq="$1"
pref="$1"
# $2: sample description file
# (e.g. $EDNA_PATH/donnees/rhone_all/MB1016K_Teleo.dat).
sample_description_file="$2"
# Paired-end read files of the run.
R1_fastq="${EDNA_PATH}/donnees/rhone_all/${pref_fastq}_R1.fastq.gz"
R2_fastq="${EDNA_PATH}/donnees/rhone_all/${pref_fastq}_R2.fastq.gz"
# Reference database, intermediate files and final results directories.
base_dir="${EDNA_PATH}/donnees/basedereference"
main_dir="${EDNA_PATH}/working/only_obitools/rhone_all/main"
fin_dir="${EDNA_PATH}/working/only_obitools/rhone_all/final"
## [t=2h] paired-end alignment of the R1/R2 reads, with a minimum alignment
## score of 40 (--score-min)
# Every expansion is quoted so that a prefix or a path containing whitespace is
# passed to the tools as a single argument.
illuminapairedend -r "$R2_fastq" "$R1_fastq" --score-min=40 > "$main_dir/${pref}.fastq"
## [t=1h] remove unaligned sequence records (keep the records whose mode is
## not "joined")
obigrep -p 'mode!="joined"' "$main_dir/${pref}.fastq" > "$main_dir/${pref}.ali.fastq"
## [t=6h] assign each sequence record to the corresponding sample/marker
## combination described in the sample description file; -u names the
## <pref>_unidentified.fastq file
ngsfilter -t "$sample_description_file" -u "$main_dir/${pref}_unidentified.fastq" "$main_dir/${pref}.ali.fastq" --fasta-output > "$main_dir/${pref}.ali.assigned.fasta"
## split the input sequence file in a set of subfiles according to the values
## of attribute `sample`; the subfiles are named <pref>_sample_<value>.fasta
obisplit -p "$main_dir/${pref}_sample_" -t sample --fasta "$main_dir/${pref}.ali.assigned.fasta"
## File listing one job (one per-sample script) per line, fed to `parallel`
all_samples_parallel_cmd_sh=$main_dir/"$pref"_sample_parallel_cmd.sh

# Write the cleaning script of one sample next to its fasta file and print the
# path of that script on stdout.
# $1: sample fasta file (<dir>/<pref>_sample_<name>.fasta)
# The script chains four steps, each reading the output of the previous one:
#   <stem>.uniq.fasta, <stem>.uniq.l20.fasta, <stem>.uniq.l20.r005.fasta and
#   <stem>.uniq.l20.r005.clean.fasta
write_sample_cmd() {
  local sample=$1
  # Strip the ".fasta" suffix only (not the first ".fasta" found in the path).
  local stem=${sample%.fasta}
  local sample_sh=${stem}_cmd.sh
  local dereplicated=${stem}.uniq.fasta
  local good=${stem}.uniq.l20.fasta
  local denoised=${stem}.uniq.l20.r005.fasta
  local clean=${stem}.uniq.l20.r005.clean.fasta
  # %q shell-quotes the paths written into the generated script.
  {
    ### dereplicate reads into unique sequences
    printf 'obiuniq -m sample %q > %q\n' "$sample" "$dereplicated"
    ### keep only sequences longer than 20bp, without IUPAC ambiguity codes
    ### and with a total coverage greater than 10 reads
    printf '%s %q > %q\n' "obigrep -p 'count>10' -s '^[ACGT]+\$' -p 'seq_length>20'" "$dereplicated" "$good"
    ### clean the sequences for PCR/sequencing errors (sequence variants)
    printf 'obiclean -r 0.05 %q > %q\n' "$good" "$denoised"
    ### remove sequences which are classified as 'internal' by obiclean
    printf '%s %q > %q\n' "obigrep -p 'obiclean_internalcount == 0'" "$denoised" "$clean"
  } > "$sample_sh"
  printf '%s\n' "$sample_sh"
}

## PARALLEL
# Start from an empty job list.
: > "$all_samples_parallel_cmd_sh"
# Glob instead of parsing `ls`, so paths containing whitespace stay intact.
for sample in "$main_dir"/"$pref"_sample_*.fasta; do
  [ -e "$sample" ] || continue # unmatched glob: no sample file
  case $sample in
    *.uniq.fasta | *.uniq.l20.fasta | *.uniq.l20.r005.fasta | *.uniq.l20.r005.clean.fasta)
      # File produced by a previous run of the per-sample scripts: it also
      # matches the glob but is not a sample to process.
      continue
      ;;
  esac
  printf 'bash %q\n' "$(write_sample_cmd "$sample")" >> "$all_samples_parallel_cmd_sh"
done
parallel < "$all_samples_parallel_cmd_sh"
## Gather the cleaned sequences of every sample into a single file
# Expansions are quoted and the derived names replace the ".fasta" suffix only,
# so a prefix containing whitespace or ".fasta" cannot corrupt the paths.
all_sample_sequences_clean="$main_dir/${pref}_all_sample_clean.fasta"
cat "$main_dir/${pref}"_sample_*.uniq.l20.r005.clean.fasta > "$all_sample_sequences_clean"
## Dereplicate and merge samples together
all_sample_sequences_uniq="${all_sample_sequences_clean%.fasta}.uniq.fasta"
obiuniq -m sample "$all_sample_sequences_clean" > "$all_sample_sequences_uniq"
## Assign each sequence to a taxon
all_sample_sequences_tag="${all_sample_sequences_uniq%.fasta}.tag.fasta"
ecotag -d "$base_dir/embl_${pref_bdr}" -R "$base_dir/db_${pref_bdr}.fasta" "$all_sample_sequences_uniq" > "$all_sample_sequences_tag"
## Attributes that are no longer useful are removed at this stage
all_sample_sequences_ann="${all_sample_sequences_tag%.fasta}.ann.fasta"
obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
  --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
  --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
  --delete-tag=obiclean_head --delete-tag=obiclean_headcount \
  --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
  --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
  --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
  --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
  --delete-tag=tail_quality "$all_sample_sequences_tag" > "$all_sample_sequences_ann"
## Sort the sequences by decreasing count
all_sample_sequences_sort="${all_sample_sequences_ann%.fasta}.sort.fasta"
obisort -k count -r "$all_sample_sequences_ann" > "$all_sample_sequences_sort"
## Generate the final results table
obitab -o "$all_sample_sequences_sort" > "$fin_dir/${pref}.csv"
## Global settings
# $1 is the run prefix (e.g. 161124_SND393_A_L005_GWM-849): it names both the
# input fastq files and the files produced by the run.
# $2 is the sample description file
# (e.g. $EDNA_PATH/donnees/rhone_all/MB1016K_Teleo.dat).
pref_fastq="$1"
pref="$1"
sample_description_file="$2"
# Suffix of the reference database files (embl_<pref_bdr>, db_<pref_bdr>.fasta).
pref_bdr='std'
# Root of the eDNA project tree; every path below hangs from it.
EDNA_PATH='/media/superdisk/edna'
# Paired-end read files.
R1_fastq="${EDNA_PATH}/donnees/rhone_all/${pref_fastq}_R1.fastq.gz"
R2_fastq="${EDNA_PATH}/donnees/rhone_all/${pref_fastq}_R2.fastq.gz"
# Reference database directory.
base_dir="${EDNA_PATH}/donnees/basedereference"
# Intermediate files directory.
main_dir="${EDNA_PATH}/working/only_obitools/rhone_all/main"
# Final results directory.
fin_dir="${EDNA_PATH}/working/only_obitools/rhone_all/final"
## File listing one job (one per-sample script) per line, fed to `parallel`
all_samples_parallel_cmd_sh=$main_dir/"$pref"_sample_parallel_cmd.sh

# Print one "bash <script>" job line for every sample fasta file that has a
# command script (<stem>_cmd.sh) next to it.
# $1: directory holding the per-sample files
# $2: run prefix
# This script does not write the per-sample command scripts itself, so fasta
# files without one are skipped: that covers the derived *.uniq*.fasta files,
# which also match the glob, and an unmatched glob.
list_sample_cmds() {
  local sample sample_sh
  # Glob instead of parsing `ls`, so paths containing whitespace stay intact.
  for sample in "$1"/"$2"_sample_*.fasta; do
    sample_sh=${sample%.fasta}_cmd.sh
    [ -f "$sample_sh" ] || continue
    # %q shell-quotes the path written into the job line.
    printf 'bash %q\n' "$sample_sh"
  done
}

## PARALLEL
# Re-run the per-sample command scripts already present in main_dir.
list_sample_cmds "$main_dir" "$pref" > "$all_samples_parallel_cmd_sh"
parallel < "$all_samples_parallel_cmd_sh"
## Gather the cleaned sequences of every sample into a single file
# Expansions are quoted and the derived names replace the ".fasta" suffix only,
# so a prefix containing whitespace or ".fasta" cannot corrupt the paths.
all_sample_sequences_clean="$main_dir/${pref}_all_sample_clean.fasta"
cat "$main_dir/${pref}"_sample_*.uniq.l20.r005.clean.fasta > "$all_sample_sequences_clean"
## Dereplicate and merge samples together
# The merged file has just been rewritten, so the dereplicated file read by
# ecotag is rebuilt from it (same command as the main pipeline script).
all_sample_sequences_uniq="${all_sample_sequences_clean%.fasta}.uniq.fasta"
obiuniq -m sample "$all_sample_sequences_clean" > "$all_sample_sequences_uniq"
## Assign each sequence to a taxon
# The output path must be assigned before the redirection that uses it;
# otherwise the redirection target is empty and ecotag is never run.
all_sample_sequences_tag="${all_sample_sequences_uniq%.fasta}.tag.fasta"
ecotag -d "$base_dir/embl_${pref_bdr}" -R "$base_dir/db_${pref_bdr}.fasta" "$all_sample_sequences_uniq" > "$all_sample_sequences_tag"
## Attributes that are no longer useful are removed at this stage
# NOTE(review): --with-taxon-at-rank=class is passed without a taxonomy
# database option on this command line - confirm obiannotate accepts that.
all_sample_sequences_ann="${all_sample_sequences_tag%.fasta}.ann.fasta"
obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
  --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
  --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
  --delete-tag=obiclean_head --delete-tag=obiclean_headcount \
  --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
  --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
  --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
  --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
  --delete-tag=tail_quality --with-taxon-at-rank=class --delete-tag=order "$all_sample_sequences_tag" > "$all_sample_sequences_ann"
## Sort the sequences by decreasing count
all_sample_sequences_sort="${all_sample_sequences_ann%.fasta}.sort.fasta"
obisort -k count -r "$all_sample_sequences_ann" > "$all_sample_sequences_sort"
## Generate the final results table
obitab -o "$all_sample_sequences_sort" > "$fin_dir/${pref}.csv"
## Global settings
# Root of the eDNA project tree.
EDNA_PATH="/media/superdisk/edna"
# Suffix of the reference database files (embl_<pref_bdr>).
pref_bdr="std"
# $1: run prefix (e.g. 161124_SND393_A_L005_GWM-849), used for the fastq file
# names and for the files produced by the run.
pref_fastq="${1}"
pref="${1}"
# $2: sample description file
# (e.g. $EDNA_PATH/donnees/rhone_all/MB1016K_Teleo.dat).
sample_description_file="${2}"
# Paired-end read files.
R1_fastq="${EDNA_PATH}/donnees/rhone_all/${pref_fastq}_R1.fastq.gz"
R2_fastq="${EDNA_PATH}/donnees/rhone_all/${pref_fastq}_R2.fastq.gz"
# Reference database directory.
base_dir="${EDNA_PATH}/donnees/basedereference"
# main_dir holds the per-sample files; main_dir2 and fin_dir are the "spygen"
# intermediate and final directories.
main_dir="${EDNA_PATH}/working/only_obitools/rhone_all/main"
main_dir2="${EDNA_PATH}/working/only_obitools/rhone_all/main_spygen"
fin_dir="${EDNA_PATH}/working/only_obitools/rhone_all/final_spygen"
# Job list of the per-sample scripts, located in main_dir.
all_samples_parallel_cmd_sh="${main_dir}/${pref}_sample_parallel_cmd.sh"
##PARALLEL
#echo "" > $all_samples_parallel_cmd_sh
#for sample in `ls $main_dir/"$pref"_sample_*.fasta`;
#do
#sample_sh="${sample/.fasta/_cmd.sh}"
#echo "bash "$sample_sh >> $all_samples_parallel_cmd_sh
#dereplicated_sample="${sample/.fasta/.uniq.fasta}"
###only sequence more than 20bp with no ambiguity IUAPC with total coverage greater than 10 reads
#good_sequence_sample="${dereplicated_sample/.fasta/.l20.fasta}"
###Clean the sequences for PCR/sequencing errors (sequence variants)
#r_sequence_sample="${good_sequence_sample/.fasta/.r005.fasta}"
###Remove sequence which are classified as 'internal' by obiclean
#clean_sequence_sample="${r_sequence_sample/.fasta/.clean.fasta}"
#done
#parallel < $all_samples_parallel_cmd_sh
## Gather the cleaned per-sample sequences found in main_dir into a single
## file written to main_dir2
# Expansions are quoted and the derived names replace the ".fasta" suffix only,
# so a prefix containing whitespace or ".fasta" cannot corrupt the paths.
all_sample_sequences_clean="$main_dir2/${pref}_all_sample_clean.fasta"
cat "$main_dir/${pref}"_sample_*.uniq.l20.r005.clean.fasta > "$all_sample_sequences_clean"
## Dereplicate and merge samples together
all_sample_sequences_uniq="${all_sample_sequences_clean%.fasta}.uniq.fasta"
obiuniq -m sample "$all_sample_sequences_clean" > "$all_sample_sequences_uniq"
## Assign each sequence to a taxon, using teleo_V1_0_VM.fasta as the
## reference sequence file
all_sample_sequences_tag="${all_sample_sequences_uniq%.fasta}.tag.fasta"
ecotag -d "$base_dir/embl_${pref_bdr}" -R "$EDNA_PATH/donnees/rhone_all/teleo_V1_0_VM.fasta" "$all_sample_sequences_uniq" > "$all_sample_sequences_tag"
## Attributes that are no longer useful are removed at this stage
## (rank_by_db is kept here)
all_sample_sequences_ann="${all_sample_sequences_tag%.fasta}.ann.fasta"
obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
  --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
  --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
  --delete-tag=obiclean_head --delete-tag=obiclean_headcount \
  --delete-tag=id_status --delete-tag=obiclean_status \
  --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
  --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
  --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
  --delete-tag=tail_quality "$all_sample_sequences_tag" > "$all_sample_sequences_ann"
## Sort the sequences by decreasing count
all_sample_sequences_sort="${all_sample_sequences_ann%.fasta}.sort.fasta"
obisort -k count -r "$all_sample_sequences_ann" > "$all_sample_sequences_sort"
## Generate the final results table
obitab -o "$all_sample_sequences_sort" > "$fin_dir/${pref}.csv"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment