Loading README.md +22 −2 Original line number Diff line number Diff line Loading @@ -19,5 +19,25 @@ Pipeline for edna paired end data to do : * taxonomy assignation with ECOTAG To download the project ``` git clone https://gitlab.mbb.univ-montp2.fr/edna/bash_vsearch_ecotag.git ``` To run the whole pipeline into a single command ``` NCORES=16 nohup bash main.sh path/to/fastqdat path/to/refdatabase $NCORES & ``` * path/to/fastqdat : absolute path to a folder with fastq files and .dat information * path/to/refdatabase : absolute path to the folder of reference database * $NCORES : number of available cores for this job Example of command ``` PATH_FASTQC="/media/superdisk/edna/donnees/rhone_test/" PATH_REFDAT="/media/superdisk/edna/donnees/reference_database_teleo/" NCORES=16 nohup bash main.sh $PATH_FASTQC $PATH_REFDAT $NCORES & ``` main.sh +48 −1 Original line number Diff line number Diff line echo "coucou" No newline at end of file ############################################################################### # define global variable ## folder which contains .fastq data files and sample description .dat files FOLDER_FASTQ=$1 ## absolute path to the reference database REF_DATABASE=$2 ## prefix of the file name into reference database base_pref=`ls "$REF_DATABASE"/*sdx | sed 's/_[0-9][0-9][0-9].sdx//'g | awk -F/ '{print $NF}' | uniq` ## number of cores available CORES=$3 ############################################################################### ## list fastq files for i in `ls "$FOLDER_FASTQ"/*_R1.fastq.gz`; do echo $i | cut -d "." 
-f 1 | sed 's/_R1//g' done > liste_fq ##list sample description files .dat for i in `ls "$FOLDER_FASTQ"/*dat`; do echo $i done > liste_dat ## table of fastq and corresponding dat files paste liste_fq liste_dat > liste_fq_dat rm liste_fq liste_dat ## writing script bash : commands to apply on fastq/dat files while IFS= read -r var do echo "bash scripts/filter_and_demultiplex.sh "$var" "$CORES done < liste_fq_dat > fq_dat_cmd.sh ## run in parallel all commands parallel < fq_dat_cmd.sh ############################################################################### ## concatenate all the sample fas into a one OBI-fasta file ALL_FAS="main/all.fasta" python2 scripts/fas_to_obista.py -o $ALL_FAS ## ###dereplicate reads into uniq sequences dereplicated_all="${ALL_FAS/.fasta/.uniq.fasta}" obiuniq -m sample $ALL_FAS > $dereplicated_all ############################################################################### #REF_DATABASE="/media/superdisk/edna/donnees/reference_database_teleo/" #base_pref="embl_std" ##Assign each sequence to a taxon all_sample_sequences_tag="${dereplicated_all/.fasta/.tag.fasta}" ecotag -d "$REF_DATABASE"/"${base_pref}" -R "$REF_DATABASE"/db_"${base_pref}".fasta $dereplicated_all > $all_sample_sequences_tag scripts/fas_to_obifasta.py 0 → 100644 +54 −0 Original line number Diff line number Diff line #=============================================================================== #INFORMATIONS #=============================================================================== """ CEFE - EPHE - YERSIN 2018 guerin pierre-edouard creer un fichier OBIFASTA a partir des sequences SPY.fas generes par VSEARCH """ #=============================================================================== #MODULES #=============================================================================== import argparse import os import Bio from Bio import SeqIO from Bio.Alphabet import IUPAC from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord 
#=============================================================================== #ARGUMENTS #=============================================================================== parser = argparse.ArgumentParser('convert fasta to obiconvert format') parser.add_argument("-o","--output",type=str) #parser.add_argument("-f","--ena_fasta",type=str) #=============================================================================== #MAIN #=============================================================================== outputFile="all.fasta" args = parser.parse_args() outputFile = args.output mes_records=[] for fasFile in os.listdir("main/"): sampleName, fasExt = os.path.splitext(fasFile) if fasExt == ".fas": fasPath="main/"+fasFile print fasFile for seq_record in SeqIO.parse(fasPath, "fasta",alphabet=IUPAC.unambiguous_dna): seq_record_idSplit=seq_record.id.split(";") countFas=seq_record_idSplit[1].split("=")[1] IDFas=seq_record_idSplit[0] local_id=IDFas+" count="+countFas+"; merged_sample={'"+sampleName+"': "+countFas+"};" local_seq=str(repr(seq_record.seq.lower().tostring())).replace("'","") local_record=SeqRecord(Seq(local_seq,IUPAC.unambiguous_dna), id=local_id,description="") mes_records.append(local_record) SeqIO.write(mes_records, outputFile, "fasta") scripts/filter_and_demultiplex.sh 0 → 100644 +107 −0 Original line number Diff line number Diff line ## define global variables ### Fastq paired-end file absolute path R1_fastq=$1"_R1.fastq.gz" R2_fastq=$1"_R2.fastq.gz" ### prefix of the run pref=`echo $1 | rev | cut -d "/" -f1 | rev` ### sample description .dat file absolute path sample_description_file=$2 ### local working directory #### where to store intermediate files main_dir=$(pwd)/main #### where to store final results fin_dir=$(pwd)/final #### number of available cores THREADS=$3 ##Check quality encoding (33 or 64?) 
#zcat ${R1_fastq} | sed -n 1,400p > $main_dir/subset.fastq #vsearch --fastq_chars $main_dir/subset.fastq 2> $main_dir/subset.log ## Original Sanger format (phred+33) ENCODING=33 ## Merge read pairs MERGED="$main_dir"/"$pref".assembled.fastq vsearch \ --threads ${THREADS} \ --fastq_mergepairs ${R1_fastq} \ --reverse ${R2_fastq} \ --fastq_ascii ${ENCODING} \ --fastqout ${MERGED} \ --fastq_allowmergestagger \ --quiet 2>> ${MERGED/.fastq/.log} ## Demultiplexing, primer clipping, sample dereplication and quality extraction ### global variables used for demultiplexing INPUT=${MERGED} TAGS=$sample_description_file PRIMER_F=`awk '{ print $4 }' ${sample_description_file} | uniq` PRIMER_R=`awk '{ print $5 }' ${sample_description_file} | uniq | tr "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" | rev` MIN_LENGTH=20 MAX_LENGTH=120 MIN_F=$(( ${#PRIMER_F} * 2 / 3 )) MIN_R=$(( ${#PRIMER_R} * 2 / 3 )) ### Define binaries, temporary files and output files CUTADAPT="$(which cutadapt) --discard-untrimmed -m ${MIN_LENGTH}" TMP_FASTQ="$main_dir"/"$pref".tmp_fastq TMP_FASTQ2="$main_dir"/"$pref".tmp_fastq2 TMP_FASTA="$main_dir"/"$pref".tmp_fasta OUTPUT="$main_dir"/"$pref".tmp_output QUALITY_FILE="${INPUT/.fastq/.qual}" ### it's necessary to generate reverse complement fastq due to cutadapt limitations INPUT_REVCOMP="${INPUT/.fastq/_rev.fastq}" vsearch --quiet \ --fastx_revcomp "${INPUT}" \ --fastqout "${INPUT_REVCOMP}" ### demultiplexing... 
awk '{ print $2"\t"$3}' ${TAGS} | while read TAG_NAME TAG_SEQ ; do LOG="$main_dir"/"${TAG_NAME}".log FINAL_FASTA="$main_dir"/"${TAG_NAME}".fas RTAG_SEQ=`echo $TAG_SEQ | tr "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" | rev` #### Trim tags, forward & reverse primers (search normal and antisens) cat "${INPUT}" "${INPUT_REVCOMP}" | \ ${CUTADAPT} -g "${TAG_SEQ}" -O "${#TAG_SEQ}" - 2> "${LOG}" | \ ${CUTADAPT} -g "${PRIMER_F}" -O "${MIN_F}" - 2>> "${LOG}" | \ ${CUTADAPT} -a "${RTAG_SEQ}" -O "${#RTAG_SEQ}" - 2>> "${LOG}" | \ ${CUTADAPT} -a "${PRIMER_R}" -O "${MIN_R}" - 2>> "${LOG}" | \ cutadapt -M ${MAX_LENGTH} - 2>> "${LOG}" > "${TAG_NAME}"_tmp mv "${TAG_NAME}"_tmp "${TMP_FASTQ}" #### Discard sequences containing Ns, add expected error rates vsearch \ --quiet \ --fastq_filter "${TMP_FASTQ}" \ --fastq_maxns 0 \ --relabel_sha1 \ --eeout \ --fastqout "${TMP_FASTQ2}" 2>> "${LOG}" #### Discard sequences containing Ns, convert to fasta vsearch \ --quiet \ --fastq_filter "${TMP_FASTQ}" \ --fastq_maxns 0 \ --relabel_sha1 \ --eeout \ --fastqout "${TMP_FASTQ2}" 2>> "${LOG}" #### Discard sequences containing Ns, convert to fasta vsearch \ --quiet \ --fastq_filter "${TMP_FASTQ}" \ --fastq_maxns 0 \ --fastaout "${TMP_FASTA}" 2>> "${LOG}" #### Dereplicate at the study level vsearch \ --quiet \ --derep_fulllength "${TMP_FASTA}" \ --sizeout \ --fasta_width 0 \ --relabel_sha1 \ --output "${FINAL_FASTA}" 2>> "${LOG}" #### Discard quality lines, extract hash, expected error rates and read length sed 'n;n;N;d' "${TMP_FASTQ2}" | \ awk 'BEGIN {FS = "[;=]"} {if (/^@/) {printf "%s\t%s\t", $1, $3} else {print length($1)}}' | \ tr -d "@" >> "${OUTPUT}" done ## Produce the final quality file sort -k3,3n -k1,1d -k2,2n "${OUTPUT}" | uniq --check-chars=40 > "${QUALITY_FILE}" ## Clean rm -f "${TMP_FASTQ}" "${TMP_FASTA}" "${TMP_FASTQ2}" "${OUTPUT}" Loading
Pipeline for eDNA paired-end data. Remaining step:

* taxonomy assignment with ECOTAG

To download the project:
```
git clone https://gitlab.mbb.univ-montp2.fr/edna/bash_vsearch_ecotag.git
```
To run the whole pipeline with a single command:
```
NCORES=16
nohup bash main.sh path/to/fastqdat path/to/refdatabase $NCORES &
```
* path/to/fastqdat : absolute path to a folder containing the fastq files and the .dat sample-description files
* path/to/refdatabase : absolute path to the reference-database folder
* $NCORES : number of cores available for this job

Example command:
```
PATH_FASTQC="/media/superdisk/edna/donnees/rhone_test/"
PATH_REFDAT="/media/superdisk/edna/donnees/reference_database_teleo/"
NCORES=16
nohup bash main.sh $PATH_FASTQC $PATH_REFDAT $NCORES &
```
#!/usr/bin/env bash
###############################################################################
# main.sh — driver for the eDNA paired-end pipeline.
#
# Usage: bash main.sh <fastq_dat_folder> <ref_database_folder> <ncores>
#   $1  folder containing <sample>_R1/_R2.fastq.gz files and .dat
#       sample-description files
#   $2  absolute path to the ECOTAG reference database
#       (contains <prefix>_NNN.sdx index files and db_<prefix>.fasta)
#   $3  number of CPU cores available for the per-sample jobs
###############################################################################

## define global variables
## folder which contains .fastq data files and sample description .dat files
FOLDER_FASTQ=$1
## absolute path to the reference database
REF_DATABASE=$2
## number of cores available
CORES=$3
## prefix of the reference-database file names
## (e.g. "embl_std" recovered from ".../embl_std_001.sdx")
## BUGFIX: the sed 'g' flag was outside the quoted expression
base_pref=$(ls "$REF_DATABASE"/*sdx | sed 's/_[0-9][0-9][0-9].sdx//g' | awk -F/ '{print $NF}' | uniq)

###############################################################################
## list fastq sample prefixes (strip the ".fastq.gz" extension and "_R1")
for i in "$FOLDER_FASTQ"/*_R1.fastq.gz; do
    echo "$i" | cut -d "." -f 1 | sed 's/_R1//g'
done > liste_fq

## list sample description .dat files
for i in "$FOLDER_FASTQ"/*dat; do
    echo "$i"
done > liste_dat

## table pairing each fastq prefix with its corresponding .dat file
paste liste_fq liste_dat > liste_fq_dat
rm liste_fq liste_dat

## write one filter/demultiplex command per fastq/dat pair
while IFS= read -r var
do
    echo "bash scripts/filter_and_demultiplex.sh $var $CORES"
done < liste_fq_dat > fq_dat_cmd.sh

## run all per-sample commands in parallel
parallel < fq_dat_cmd.sh

###############################################################################
## concatenate all per-sample .fas files into a single OBI-fasta file
## BUGFIX: the script is named fas_to_obifasta.py, not fas_to_obista.py —
## the original call could never find the file
ALL_FAS="main/all.fasta"
python2 scripts/fas_to_obifasta.py -o "$ALL_FAS"

## dereplicate reads into unique sequences
dereplicated_all="${ALL_FAS/.fasta/.uniq.fasta}"
obiuniq -m sample "$ALL_FAS" > "$dereplicated_all"

###############################################################################
## assign each unique sequence to a taxon with ECOTAG
all_sample_sequences_tag="${dereplicated_all/.fasta/.tag.fasta}"
ecotag -d "$REF_DATABASE"/"${base_pref}" \
       -R "$REF_DATABASE"/db_"${base_pref}".fasta \
       "$dereplicated_all" > "$all_sample_sequences_tag"
scripts/fas_to_obifasta.py 0 → 100644 +54 −0 Original line number Diff line number Diff line #=============================================================================== #INFORMATIONS #=============================================================================== """ CEFE - EPHE - YERSIN 2018 guerin pierre-edouard creer un fichier OBIFASTA a partir des sequences SPY.fas generes par VSEARCH """ #=============================================================================== #MODULES #=============================================================================== import argparse import os import Bio from Bio import SeqIO from Bio.Alphabet import IUPAC from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord #=============================================================================== #ARGUMENTS #=============================================================================== parser = argparse.ArgumentParser('convert fasta to obiconvert format') parser.add_argument("-o","--output",type=str) #parser.add_argument("-f","--ena_fasta",type=str) #=============================================================================== #MAIN #=============================================================================== outputFile="all.fasta" args = parser.parse_args() outputFile = args.output mes_records=[] for fasFile in os.listdir("main/"): sampleName, fasExt = os.path.splitext(fasFile) if fasExt == ".fas": fasPath="main/"+fasFile print fasFile for seq_record in SeqIO.parse(fasPath, "fasta",alphabet=IUPAC.unambiguous_dna): seq_record_idSplit=seq_record.id.split(";") countFas=seq_record_idSplit[1].split("=")[1] IDFas=seq_record_idSplit[0] local_id=IDFas+" count="+countFas+"; merged_sample={'"+sampleName+"': "+countFas+"};" local_seq=str(repr(seq_record.seq.lower().tostring())).replace("'","") local_record=SeqRecord(Seq(local_seq,IUPAC.unambiguous_dna), id=local_id,description="") mes_records.append(local_record) SeqIO.write(mes_records, outputFile, "fasta")
#!/usr/bin/env bash
###############################################################################
# scripts/filter_and_demultiplex.sh — merge, demultiplex, filter and
# dereplicate one paired-end fastq sample.
#
# Usage: bash filter_and_demultiplex.sh <sample_prefix> <dat_file> <ncores>
#   $1  path prefix of the sample (expects <prefix>_R1.fastq.gz and
#       <prefix>_R2.fastq.gz)
#   $2  sample description .dat file (column 2: tag name, 3: tag sequence,
#       4: forward primer, 5: reverse primer)
#   $3  number of threads for vsearch
###############################################################################

## define global variables
### Fastq paired-end file absolute paths
R1_fastq=$1"_R1.fastq.gz"
R2_fastq=$1"_R2.fastq.gz"
### prefix of the run (basename of $1)
pref=$(echo "$1" | rev | cut -d "/" -f1 | rev)
### sample description .dat file absolute path
sample_description_file=$2
### local working directories
main_dir=$(pwd)/main      # intermediate files
fin_dir=$(pwd)/final      # final results
### number of available cores
THREADS=$3

## Check quality encoding (33 or 64?)
#zcat ${R1_fastq} | sed -n 1,400p > $main_dir/subset.fastq
#vsearch --fastq_chars $main_dir/subset.fastq 2> $main_dir/subset.log
## Original Sanger format (phred+33)
ENCODING=33

## Merge read pairs
MERGED="$main_dir"/"$pref".assembled.fastq
vsearch \
    --threads "${THREADS}" \
    --fastq_mergepairs "${R1_fastq}" \
    --reverse "${R2_fastq}" \
    --fastq_ascii "${ENCODING}" \
    --fastqout "${MERGED}" \
    --fastq_allowmergestagger \
    --quiet 2>> "${MERGED/.fastq/.log}"

## Demultiplexing, primer clipping, sample dereplication and quality extraction
### global variables used for demultiplexing
INPUT=${MERGED}
TAGS=$sample_description_file
PRIMER_F=$(awk '{ print $4 }' "${sample_description_file}" | uniq)
### reverse primer, reverse-complemented (IUPAC-aware tr table, then rev)
PRIMER_R=$(awk '{ print $5 }' "${sample_description_file}" | uniq | tr "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" | rev)
MIN_LENGTH=20
MAX_LENGTH=120
### require at least 2/3 of each primer to overlap
MIN_F=$(( ${#PRIMER_F} * 2 / 3 ))
MIN_R=$(( ${#PRIMER_R} * 2 / 3 ))

### Define binaries, temporary files and output files
CUTADAPT="$(which cutadapt) --discard-untrimmed -m ${MIN_LENGTH}"
TMP_FASTQ="$main_dir"/"$pref".tmp_fastq
TMP_FASTQ2="$main_dir"/"$pref".tmp_fastq2
TMP_FASTA="$main_dir"/"$pref".tmp_fasta
OUTPUT="$main_dir"/"$pref".tmp_output
QUALITY_FILE="${INPUT/.fastq/.qual}"

### cutadapt only clips 5' tags reliably, so also search the reverse
### complement of the merged reads
INPUT_REVCOMP="${INPUT/.fastq/_rev.fastq}"
vsearch --quiet \
    --fastx_revcomp "${INPUT}" \
    --fastqout "${INPUT_REVCOMP}"

### demultiplexing...
awk '{ print $2"\t"$3}' "${TAGS}" | while read TAG_NAME TAG_SEQ ; do
    LOG="$main_dir"/"${TAG_NAME}".log
    FINAL_FASTA="$main_dir"/"${TAG_NAME}".fas
    RTAG_SEQ=$(echo "$TAG_SEQ" | tr "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" | rev)

    #### Trim tags, forward & reverse primers (search normal and antisens)
    cat "${INPUT}" "${INPUT_REVCOMP}" | \
        ${CUTADAPT} -g "${TAG_SEQ}" -O "${#TAG_SEQ}" - 2> "${LOG}" | \
        ${CUTADAPT} -g "${PRIMER_F}" -O "${MIN_F}" - 2>> "${LOG}" | \
        ${CUTADAPT} -a "${RTAG_SEQ}" -O "${#RTAG_SEQ}" - 2>> "${LOG}" | \
        ${CUTADAPT} -a "${PRIMER_R}" -O "${MIN_R}" - 2>> "${LOG}" | \
        cutadapt -M "${MAX_LENGTH}" - 2>> "${LOG}" > "${TAG_NAME}"_tmp
    mv "${TAG_NAME}"_tmp "${TMP_FASTQ}"

    #### Discard sequences containing Ns, add expected error rates
    #### BUGFIX: this vsearch command was duplicated verbatim in the
    #### original (it ran twice, overwriting the same TMP_FASTQ2)
    vsearch \
        --quiet \
        --fastq_filter "${TMP_FASTQ}" \
        --fastq_maxns 0 \
        --relabel_sha1 \
        --eeout \
        --fastqout "${TMP_FASTQ2}" 2>> "${LOG}"

    #### Discard sequences containing Ns, convert to fasta
    vsearch \
        --quiet \
        --fastq_filter "${TMP_FASTQ}" \
        --fastq_maxns 0 \
        --fastaout "${TMP_FASTA}" 2>> "${LOG}"

    #### Dereplicate at the study level
    vsearch \
        --quiet \
        --derep_fulllength "${TMP_FASTA}" \
        --sizeout \
        --fasta_width 0 \
        --relabel_sha1 \
        --output "${FINAL_FASTA}" 2>> "${LOG}"

    #### Discard quality lines, extract hash, expected error rates and
    #### read length
    sed 'n;n;N;d' "${TMP_FASTQ2}" | \
        awk 'BEGIN {FS = "[;=]"} {if (/^@/) {printf "%s\t%s\t", $1, $3} else {print length($1)}}' | \
        tr -d "@" >> "${OUTPUT}"
done

## Produce the final quality file (uniq on the 40-char sha1 prefix)
sort -k3,3n -k1,1d -k2,2n "${OUTPUT}" | uniq --check-chars=40 > "${QUALITY_FILE}"

## Clean
rm -f "${TMP_FASTQ}" "${TMP_FASTA}" "${TMP_FASTQ2}" "${OUTPUT}"