Commit 2ba262e6 authored by peguerin's avatar peguerin

scripts

parent 52313f4b
......@@ -19,5 +19,25 @@ Pipeline for edna paired end data to do :
* taxonomy assignation with ECOTAG
git clone https://gitlab.mbb.univ-montp2.fr/edna/bash_vsearch_ecotag.git
\ No newline at end of file
To download the project
```
git clone https://gitlab.mbb.univ-montp2.fr/edna/bash_vsearch_ecotag.git
```
To run the whole pipeline with a single command
```
NCORES=16
nohup bash main.sh path/to/fastqdat path/to/refdatabase $NCORES &
```
* path/to/fastqdat : absolute path to a folder with fastq files and .dat information
* path/to/refdatabase : absolute path to the folder of reference database
* $NCORES : number of available cores for this job
Example command:
```
PATH_FASTQC="/media/superdisk/edna/donnees/rhone_test/"
PATH_REFDAT="/media/superdisk/edna/donnees/reference_database_teleo/"
NCORES=16
nohup bash main.sh $PATH_FASTQC $PATH_REFDAT $NCORES &
```
echo "coucou"
\ No newline at end of file
###############################################################################
# main.sh — driver for the eDNA paired-end pipeline.
# Usage: bash main.sh <fastq_folder> <refdb_folder> <ncores>
# define global variable
## folder which contains .fastq data files and sample description .dat files
FOLDER_FASTQ=$1
## absolute path to the reference database
REF_DATABASE=$2
## prefix of the file name into reference database
## (basename of the *.sdx shard files with the trailing _NNN.sdx removed)
base_pref=$(ls "$REF_DATABASE"/*sdx | sed 's/_[0-9][0-9][0-9].sdx//g' | awk -F/ '{print $NF}' | uniq)
## number of cores available
CORES=$3
###############################################################################
## list fastq file prefixes (path without the _R1.fastq.gz suffix);
## use a glob instead of parsing `ls` so paths with spaces or dots do not break
for fq in "$FOLDER_FASTQ"/*_R1.fastq.gz; do
  [ -e "$fq" ] || continue
  printf '%s\n' "${fq%_R1.fastq.gz}"
done > liste_fq
## list sample description files .dat
for dat in "$FOLDER_FASTQ"/*dat; do
  [ -e "$dat" ] || continue
  printf '%s\n' "$dat"
done > liste_dat
## table of fastq prefixes and corresponding dat files (tab separated)
paste liste_fq liste_dat > liste_fq_dat
rm liste_fq liste_dat
## writing script bash : one demultiplexing command per fastq/dat pair
## ($var holds "prefix<TAB>datfile"; the tab acts as an argument separator
## when the generated script is executed)
while IFS= read -r var; do
  echo "bash scripts/filter_and_demultiplex.sh $var $CORES"
done < liste_fq_dat > fq_dat_cmd.sh
## run in parallel all commands
parallel < fq_dat_cmd.sh
###############################################################################
## concatenate all the sample .fas into one OBI-fasta file
ALL_FAS="main/all.fasta"
python2 scripts/fas_to_obista.py -o "$ALL_FAS"
## dereplicate reads into unique sequences
dereplicated_all="${ALL_FAS/.fasta/.uniq.fasta}"
obiuniq -m sample "$ALL_FAS" > "$dereplicated_all"
###############################################################################
#REF_DATABASE="/media/superdisk/edna/donnees/reference_database_teleo/"
#base_pref="embl_std"
## assign each sequence to a taxon with ecotag
all_sample_sequences_tag="${dereplicated_all/.fasta/.tag.fasta}"
ecotag -d "$REF_DATABASE"/"${base_pref}" -R "$REF_DATABASE"/db_"${base_pref}".fasta "$dereplicated_all" > "$all_sample_sequences_tag"
#===============================================================================
#INFORMATIONS
#===============================================================================
"""
CEFE - EPHE - YERSIN 2018
guerin pierre-edouard
Build an OBIFASTA file from the per-sample SPY .fas sequences generated by
VSEARCH. (Python 2 / Biopython script, invoked as `python2` by the pipeline.)
"""
#===============================================================================
#MODULES
#===============================================================================
import argparse
import os
import Bio
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
#===============================================================================
#ARGUMENTS
#===============================================================================
parser = argparse.ArgumentParser('convert fasta to obiconvert format')
# default= keeps outputFile from becoming None when -o is omitted
# (the old code overwrote the "all.fasta" fallback with args.output == None)
parser.add_argument("-o", "--output", type=str, default="all.fasta")
#parser.add_argument("-f","--ena_fasta",type=str)
#===============================================================================
#MAIN
#===============================================================================
args = parser.parse_args()
outputFile = args.output
mes_records = []
# collect every per-sample .fas file from the working directory
for fasFile in os.listdir("main/"):
    sampleName, fasExt = os.path.splitext(fasFile)
    if fasExt == ".fas":
        fasPath = "main/" + fasFile
        print(fasFile)
        for seq_record in SeqIO.parse(fasPath, "fasta", alphabet=IUPAC.unambiguous_dna):
            # assumes headers like "<id>;size=<count>;..." — TODO confirm
            # against the vsearch --sizeout output feeding this script
            seq_record_idSplit = seq_record.id.split(";")
            countFas = seq_record_idSplit[1].split("=")[1]
            IDFas = seq_record_idSplit[0]
            # OBITools-style header carrying the per-sample count
            local_id = IDFas + " count=" + countFas + "; merged_sample={'" + sampleName + "': " + countFas + "};"
            # lower-case sequence string (former repr()/replace round-trip
            # was equivalent to a plain str() conversion)
            local_seq = str(seq_record.seq.lower())
            local_record = SeqRecord(Seq(local_seq, IUPAC.unambiguous_dna), id=local_id, description="")
            mes_records.append(local_record)
SeqIO.write(mes_records, outputFile, "fasta")
## filter_and_demultiplex.sh — merge read pairs, demultiplex by tag,
## quality-filter and dereplicate one sequencing run.
## Usage: bash filter_and_demultiplex.sh <fastq_prefix> <dat_file> <threads>
## define global variables
### Fastq paired-end file absolute path
R1_fastq="$1"_R1.fastq.gz
R2_fastq="$1"_R2.fastq.gz
### prefix of the run (last path component, no subshells needed)
pref=${1##*/}
### sample description .dat file absolute path
sample_description_file=$2
### local working directory
#### where to store intermediate files
main_dir=$(pwd)/main
#### where to store final results
fin_dir=$(pwd)/final
#### number of available cores
THREADS=$3
##Check quality encoding (33 or 64?)
#zcat ${R1_fastq} | sed -n 1,400p > $main_dir/subset.fastq
#vsearch --fastq_chars $main_dir/subset.fastq 2> $main_dir/subset.log
## Original Sanger format (phred+33)
ENCODING=33
## Merge read pairs
MERGED="$main_dir"/"$pref".assembled.fastq
vsearch \
    --threads "${THREADS}" \
    --fastq_mergepairs "${R1_fastq}" \
    --reverse "${R2_fastq}" \
    --fastq_ascii "${ENCODING}" \
    --fastqout "${MERGED}" \
    --fastq_allowmergestagger \
    --quiet 2>> "${MERGED/.fastq/.log}"
## Demultiplexing, primer clipping, sample dereplication and quality extraction
### global variables used for demultiplexing
INPUT=${MERGED}
TAGS=$sample_description_file
### forward primer: column 4 of the .dat file
PRIMER_F=$(awk '{ print $4 }' "${sample_description_file}" | uniq)
### reverse primer: column 5, IUPAC-aware reverse complement
PRIMER_R=$(awk '{ print $5 }' "${sample_description_file}" | uniq | tr "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" | rev)
MIN_LENGTH=20
MAX_LENGTH=120
### require at least 2/3 of each primer to overlap
MIN_F=$(( ${#PRIMER_F} * 2 / 3 ))
MIN_R=$(( ${#PRIMER_R} * 2 / 3 ))
### Define binaries, temporary files and output files
CUTADAPT="$(which cutadapt) --discard-untrimmed -m ${MIN_LENGTH}"
TMP_FASTQ="$main_dir"/"$pref".tmp_fastq
TMP_FASTQ2="$main_dir"/"$pref".tmp_fastq2
TMP_FASTA="$main_dir"/"$pref".tmp_fasta
OUTPUT="$main_dir"/"$pref".tmp_output
QUALITY_FILE="${INPUT/.fastq/.qual}"
### it's necessary to generate reverse complement fastq due to cutadapt limitations
INPUT_REVCOMP="${INPUT/.fastq/_rev.fastq}"
vsearch --quiet \
    --fastx_revcomp "${INPUT}" \
    --fastqout "${INPUT_REVCOMP}"
### demultiplexing: one pass per tag listed in the .dat file (columns 2 and 3)
awk '{ print $2"\t"$3}' "${TAGS}" | while read -r TAG_NAME TAG_SEQ ; do
    LOG="$main_dir"/"${TAG_NAME}".log
    FINAL_FASTA="$main_dir"/"${TAG_NAME}".fas
    ### IUPAC-aware reverse complement of the tag
    RTAG_SEQ=$(echo "$TAG_SEQ" | tr "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" | rev)
    #### Trim tags, forward & reverse primers (search normal and antisens)
    cat "${INPUT}" "${INPUT_REVCOMP}" | \
        ${CUTADAPT} -g "${TAG_SEQ}" -O "${#TAG_SEQ}" - 2> "${LOG}" | \
        ${CUTADAPT} -g "${PRIMER_F}" -O "${MIN_F}" - 2>> "${LOG}" | \
        ${CUTADAPT} -a "${RTAG_SEQ}" -O "${#RTAG_SEQ}" - 2>> "${LOG}" | \
        ${CUTADAPT} -a "${PRIMER_R}" -O "${MIN_R}" - 2>> "${LOG}" | \
        cutadapt -M "${MAX_LENGTH}" - 2>> "${LOG}" > "${TAG_NAME}"_tmp
    mv "${TAG_NAME}"_tmp "${TMP_FASTQ}"
    #### Discard sequences containing Ns, add expected error rates.
    #### NOTE(review): the original script ran this exact command twice in a
    #### row; the duplicate was removed — TMP_FASTQ2 content is unchanged.
    vsearch \
        --quiet \
        --fastq_filter "${TMP_FASTQ}" \
        --fastq_maxns 0 \
        --relabel_sha1 \
        --eeout \
        --fastqout "${TMP_FASTQ2}" 2>> "${LOG}"
    #### Discard sequences containing Ns, convert to fasta
    vsearch \
        --quiet \
        --fastq_filter "${TMP_FASTQ}" \
        --fastq_maxns 0 \
        --fastaout "${TMP_FASTA}" 2>> "${LOG}"
    #### Dereplicate at the study level
    vsearch \
        --quiet \
        --derep_fulllength "${TMP_FASTA}" \
        --sizeout \
        --fasta_width 0 \
        --relabel_sha1 \
        --output "${FINAL_FASTA}" 2>> "${LOG}"
    #### Discard quality lines, extract hash, expected error rates and read length
    sed 'n;n;N;d' "${TMP_FASTQ2}" | \
        awk 'BEGIN {FS = "[;=]"}
             {if (/^@/) {printf "%s\t%s\t", $1, $3} else {print length($1)}}' | \
        tr -d "@" >> "${OUTPUT}"
done
## Produce the final quality file (sha1 labels are 40 chars, hence --check-chars=40)
sort -k3,3n -k1,1d -k2,2n "${OUTPUT}" | uniq --check-chars=40 > "${QUALITY_FILE}"
## Clean
rm -f "${TMP_FASTQ}" "${TMP_FASTA}" "${TMP_FASTQ2}" "${OUTPUT}"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment