build_bdr.sh 2.86 KB
Newer Older
peguerin's avatar
peguerin committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
##########################################################################
## Codes for scientific papers related to metabarcoding studies
##
## AUTHORS
## =======
## * Pierre-Edouard Guerin   | pierre-edouard.guerin@cefe.cnrs.fr
## * Virginie Marques        | virginie.marques@etu.umontpellier.fr
## * CNRS/CEFE, CNRS/MARBEC  | Montpellier, France
## * 2018-2020
##
##
## Inspired by the Eric Coissac et al. obitools
## Molecular Ecology Resources 2015
##
## DESCRIPTION
## ===========
##
## Build a reference database for "miseq" sequences data.
## We recommend running the commands one line at a time in a Linux terminal
## so that the result of each step can be checked
##
##
##
## USAGE
## =====
## bash build_bdr.sh
##
##
##########################################################################
peguerin's avatar
peguerin committed
30
31
32
33
34
35

## load an environment with obitools
## SINGULARITY_SIMG is the path of the container image holding the obitools.
## NOTE(review): `singularity shell` starts an interactive shell in the
## container; the remaining steps are presumably meant to be typed inside
## that shell (see the file header) -- confirm before running unattended.
SINGULARITY_SIMG="/media/superdisk/utils/conteneurs/obitools.simg"
## the image path is quoted so that it is always passed as a single argument
singularity shell --bind /media/superdisk:/media/superdisk "${SINGULARITY_SIMG}"


peguerin's avatar
peguerin committed
36
## configure arguments value
peguerin's avatar
peguerin committed
37
## read ./config.sh into the current shell so that the values it defines
## are available to every following command
. ./config.sh
peguerin's avatar
peguerin committed
38
## download the sequences
peguerin's avatar
peguerin committed
39
40
41
42
43
## Download the files of the ENA "std" release into ./EMBL and decompress them.
## -p: do not complain when EMBL already exists (re-run of the script)
mkdir -p EMBL
(
  ## The download runs in a subshell: the working directory of the calling
  ## shell is never changed, and if EMBL cannot be entered the step stops
  ## instead of downloading/decompressing in the wrong directory.
  cd EMBL || exit 1
  ## the URL is quoted so that the wildcard is handed to wget untouched
  ## rather than being a candidate for shell pathname expansion
  wget 'ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/std/*'
  ## "./" keeps a file name starting with "-" from being read as an option
  gzip -d ./*
)
peguerin's avatar
peguerin committed
44
## download taxonomy
peguerin's avatar
peguerin committed
45
46
47
48
49
## Download the NCBI taxonomy dump into ./TAXO and unpack it there.
## -p: do not complain when TAXO already exists (re-run of the script)
mkdir -p TAXO
(
  ## The download runs in a subshell: the working directory of the calling
  ## shell is never changed, and if TAXO cannot be entered the step stops
  ## instead of downloading/extracting in the wrong directory.
  cd TAXO || exit 1
  wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
  tar -zxvf taxdump.tar.gz
)
peguerin's avatar
peguerin committed
50

peguerin's avatar
peguerin committed
51
## add mitofish sequences (by default we skip this step)
peguerin's avatar
peguerin committed
52
53
54
55
56
57
58
59
60
## Optional step: runs only when MITOFISH is exactly 'y'
## (MITOFISH is presumably defined in config.sh -- confirm).
## An unset or empty MITOFISH is treated as "skip": the quoted expansion with
## an empty default avoids the test syntax error that an unquoted empty value
## would cause.
if [[ "${MITOFISH:-}" == 'y' ]]; then
  echo "adding sequences from mitofish..."
  ## helper script, then conversion of mitofish/mitogene_12S.fasta into an
  ## ecoPCR database stored under mitofish/
  bash scripts/add_sequences_from_mitofish.sh
  obiconvert --skip-on-error --fasta -t ./TAXO --ecopcrdb-output=mitofish/"${rd_prefix}" mitofish/mitogene_12S.fasta
else
  echo "skip adding sequences from mitofish"
fi

peguerin's avatar
peguerin committed
61
## format the data
peguerin's avatar
peguerin committed
62
## convert the EMBL flat files matching EMBL/rel_std_*.dat into an ecoPCR
## database named after "${rd_prefix}", using ./TAXO as the -t argument
obiconvert \
  --skip-on-error \
  --embl \
  -t ./TAXO \
  --ecopcrdb-output="${rd_prefix}" \
  EMBL/rel_std_*.dat
peguerin's avatar
peguerin committed
63
64
## ecoPCR to simulate an in silico PCR
#### 50 :: change to 20 : lost of lamproie
peguerin's avatar
peguerin committed
65
## run ecoPCR against database "${rd_prefix}" with the two primers; the values
## of -e, -l and -L come from variables defined outside this block.
## The result is written to v_<rd_prefix>.ecopcr
ecoPCR \
  -d "${rd_prefix}" \
  -e "${ecoPCR_e}" \
  -l "${ecoPCR_l}" \
  -L "${ecoPCR_L}" \
  "${primer5}" "${primer3}" \
  > "v_${rd_prefix}.ecopcr"
peguerin's avatar
peguerin committed
66
## clean the database
peguerin's avatar
peguerin committed
67
68
69
70
71
72
73
74
## Files of the cleaning chain, from the ecoPCR output to the final database.
ecopcr_hits="v_${rd_prefix}.ecopcr"
ranked_fasta="v_${rd_prefix}_clean.fasta"
dereplicated_fasta="v_${rd_prefix}_clean_uniq.fasta"
family_checked_fasta="v_${rd_prefix}_clean_uniq_clean.fasta"
reference_fasta="db_${rd_prefix}.fasta"

## step 1: keep the sequences described at the species, genus and family ranks
obigrep -d "${rd_prefix}" \
  --require-rank=species \
  --require-rank=genus \
  --require-rank=family \
  "${ecopcr_hits}" > "${ranked_fasta}"

## step 2: dereplicate (drop redundant sequences)
obiuniq -d "${rd_prefix}" "${ranked_fasta}" > "${dereplicated_fasta}"

## step 3: after dereplication, keep only sequences with a family-level taxid
obigrep -d "${rd_prefix}" \
  --require-rank=family \
  "${dereplicated_fasta}" > "${family_checked_fasta}"

## step 4: give every remaining sequence a unique identifier
obiannotate --uniq-id "${family_checked_fasta}" > "${reference_fasta}"
peguerin's avatar
peguerin committed
75
## your reference database is built !
peguerin's avatar
peguerin committed
76

peguerin's avatar
peguerin committed
77
78


peguerin's avatar
peguerin committed
79
80
#obiconvert --skip-on-error --fasta -t ./TAXO --ecopcrdb-output=mitofish/"${rd_prefix}" mitofish/mitogene_12S.fasta
#obiconvert --skip-on-error --embl -t ./TAXO --ecopcrdb-output="${rd_prefix}" EMBL/rel_std_*.dat
peguerin's avatar
peguerin committed
81
82