Commit 4dcb0b04 authored by peguerin's avatar peguerin

mitofish

parent 628e8d35
# exclude everything
mitofish/*
......@@ -68,10 +68,8 @@ In addition, it includes `EMBL` folder which contains all the sequences and `TAX
Now, your reference database can be used for taxonomic assignment in our pipeline to generate species environmental presence from raw eDNA data.
You can use the absolute path of the folder of your reference database as the `/path/to/reference_database` argument in [only_obitools](http://gitlab.mbb.univ-montp2.fr/edna/only_obitools) and [snakemake_only_obitools](http://gitlab.mbb.univ-montp2.fr/edna/snakemake_only_obitools)
You can use the absolute path of the folder of your reference database as the `/path/to/reference_database` argument in the following pipelines :
* [only_obitools](http://gitlab.mbb.univ-montp2.fr/edna/only_obitools)
* [nextflow_obitools](https://gitlab.mbb.univ-montp2.fr/edna/nextflow_obitoolsand)
* [snakemake_only_obitools](http://gitlab.mbb.univ-montp2.fr/edna/snakemake_only_obitools)
......@@ -14,6 +14,16 @@ cd TAXO
wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -zxvf taxdump.tar.gz
cd ..
if [ $MITOFISH == 'y' ]
then
echo "adding sequences from mitofish..."
bash scripts/add_sequences_from_mitofish.sh
obiconvert --skip-on-error --fasta -t ./TAXO --ecopcrdb-output=mitofish/"${rd_prefix}" mitofish/mitogene_12S.fasta
else
echo "skip adding sequences from mitofish"
fi
# format the data
obiconvert --skip-on-error --embl -t ./TAXO --ecopcrdb-output="${rd_prefix}" EMBL/rel_std_*.dat
# ecoPCR to simulate an in silico PCR
......
# argument values for building reference database
## "y" add sequences from mitofish the reference database
## "n" don't add sequences from mitofish
MITOFISH="n"
## reference database prefix
rd_prefix="embl_std"
......
## add sequences of gene from mitofish database
mkdir mitofish
## Downloads mitogenomes
mkdir mitofish/mitogenomes
wget http://mitofish.aori.u-tokyo.ac.jp/files/mitogenomes.zip --directory-prefix=mitofish/
unzip mitofish/mitogenomes.zip -d mitofish/mitogenomes
## Downloads gene annotations of mitogenomes
mkdir mitofish/mitoannotations
wget http://mitofish.aori.u-tokyo.ac.jp/files/mitoannotations.zip --directory-prefix=mitofish/
unzip mitofish/mitoannotations.zip -d mitofish/mitoannotations
## list of species with mitogenomes
for i in `ls mitofish/mitogenomes`;
do
echo $i | cut -d "." -f 1 | cut -d "_" -f 3-4;
done | sort | uniq | sed 's/_/ /g' > mitofish/species_mitogenomes.list
## list of species with mitoannotations
for i in `ls mitofish/mitoannotations/`;
do
echo $i | cut -d "." -f 1 | cut -d "_" -f 3-4;
done | sort | uniq | sed 's/_/ /g' > mitofish/species_mitoannotations.list
## list of species with mitogenomes AND mitoannotations
grep -f mitofish/species_mitoannotations.list mitofish/species_mitogenomes.list > mitofish/species_mitogenome_annotations.list
## extract 12S sequences
mkdir mitofish/complete
cat mitofish/species_mitogenome_annotations.list | while read i;
do
GENUS=`echo $i | awk '{ print $1}'`;
SPECIES=`echo $i | awk '{ print $2}'`;
GENUS_SPECIES=`echo $GENUS"_"$SPECIES`;
for j in `ls mitofish/mitogenomes/*$GENUS_SPECIES*`;
do
JFILE=`basename $j`
ln -s $(pwd)/mitofish/mitogenomes/$JFILE mitofish/complete/$JFILE;
done
for k in `ls mitofish/mitoannotations/*$GENUS_SPECIES*`;
do
KFILE=`basename $k`
ln -s $(pwd)/mitofish/mitoannotations/$KFILE mitofish/complete/$KFILE;
done
done
cat mitofish/complete/*fa > mitofish/mitogenomes.fa
mkdir mitofish/mitogenomes_12S
python2 scripts/extract_gene_from_mito.py -i mitofish/mitogenomes.fa -a mitofish/complete -o mitofish/mitogenomes_12S -g 12S -t rRNA -5 50 -3 50 -T TAXO/names.dmp
cat mitofish/mitogenomes_12S/*fa > mitofish/mitogene_12S.fasta
import Bio
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import argparse
#===============================================================================
#ARGUMENTS
#===============================================================================
parser = argparse.ArgumentParser(description='mitochondrion extract gene')
parser.add_argument("-o","--output", type=str)
parser.add_argument("-i","--input",type=str)
parser.add_argument("-g","--geneName",type=str)
parser.add_argument("-t","--geneType",type=str)
parser.add_argument("-5","--sizeflanking5Region",type=int)
parser.add_argument("-3","--sizeflanking3Region",type=int)
parser.add_argument("-a","--annotations_repertory",type=str)
parser.add_argument("-T","--taxonomy",type=str)
#===============================================================================
#MAIN
#===============================================================================
args = parser.parse_args()
inputFile = args.input
queryGeneName=args.geneName
queryGeneType=args.geneType
flanking3Region=args.sizeflanking3Region
flanking5Region=args.sizeflanking5Region
dossierOutput=args.output
dossierAnnotations=args.annotations_repertory
taxonomyFile=args.taxonomy
"""
inputFile="actino_yersin_mitogenomes.fa"
queryGeneName="12S"
queryGeneType="rRNA"
flanking3Region=50
flanking5Region=50
dossierOutput="actino_yersin_12s_mito"
"""
dicSpeciesTaxID={}
with open(taxonomyFile,'r') as readTaxo:
for ligne in readTaxo.readlines():
ligneSplit=ligne.split("|")
taxid=int(ligneSplit[0])
speciesName=ligneSplit[1].strip()
dicSpeciesTaxID[speciesName]=taxid
print dicSpeciesTaxID
if queryGeneType != "gene":
for seq_record in SeqIO.parse(inputFile, "fasta",alphabet=IUPAC.unambiguous_dna):
localDescription=seq_record.description.split('|')
if len(localDescription)<3:
continue
#genusSpecies=localDescription[-1].replace(' ','_').replace('.','').replace('(','_').replace(')','_').replace('\'','_').replace(',','').replace("___","_").replace("__","_")
genusSpecies=" ".join(localDescription[-1].split(' ')[0:2])
if genusSpecies in dicSpeciesTaxID:
SpeciesTaxID=dicSpeciesTaxID[genusSpecies]
accessNCBI=localDescription[3].split('.')[0]
nomFichierMitoAnnotation=dossierAnnotations+"/"+accessNCBI+"_"+genusSpecies.replace(' ','_')+".txt"
mitoSequence=str(repr(str(seq_record.seq.lower()))).replace("'","")
sizeMitoSequence=len(mitoSequence)
try:
with open(nomFichierMitoAnnotation,'r') as readAnnot:
for ligne in readAnnot.readlines():
ligneSplit=ligne.split()[0:-1]
if ligneSplit[0] == queryGeneType and ligneSplit[3] == queryGeneName:
geneCoords=ligneSplit[1].split("..")
geneCoords= [int(i) for i in geneCoords]
wflankedGeneCoords = [geneCoords[0]-flanking5Region, geneCoords[1]+flanking3Region]
if wflankedGeneCoords[0]<0:
wflankedGeneCoords[0]=0
if wflankedGeneCoords[1] > sizeMitoSequence:
wflankedGeneCoords[1]=sizeMitoSequence
except:
print nomFichierMitoAnnotation+" not found"
continue
else:
print "no tax ID found for "+genusSpecies
continue
queryGeneSequence=mitoSequence[wflankedGeneCoords[0]:wflankedGeneCoords[1]]
local_seq=Seq(queryGeneSequence,IUPAC.unambiguous_dna)
#local_id=queryGeneName+"|"+"mitochrondrion"+"|"+accessNCBI+"|"+genusSpecies+"|"
local_id=accessNCBI+" gene="+queryGeneName+"; taxid="+str(SpeciesTaxID)+"; species_name="+genusSpecies+";"
local_description=" coords={0}-{1}; coords_with_flanking_regions={2}-{3}; flanking_sizes={4}_{5}bps;".format(geneCoords[0],geneCoords[1],wflankedGeneCoords[0],wflankedGeneCoords[1],flanking5Region,flanking3Region)
local_record=SeqRecord(local_seq,id=local_id, description=local_description)
nomFichierFasta=dossierOutput+"/"+queryGeneName+"_"+accessNCBI+"_"+genusSpecies.replace(" ","_")+".fa"
SeqIO.write(local_record,nomFichierFasta , "fasta")
else:
for seq_record in SeqIO.parse(inputFile, "fasta",alphabet=IUPAC.unambiguous_dna):
localDescription=seq_record.description.split('|')
if len(localDescription)<3:
continue
genusSpecies=localDescription[-1].replace(' ','_').replace('.','').replace('(','_').replace(')','_').replace('\'','_').replace(',','').replace("___","_").replace("__","_")
accessNCBI=localDescription[3].split('.')[0]
nomFichierMitoAnnotation=dossierAnnotations+"/"+accessNCBI+"_"+genusSpecies+".txt"
mitoSequence=str(repr(str(seq_record.seq.lower()))).replace("'","")
sizeMitoSequence=len(mitoSequence)
try:
with open(nomFichierMitoAnnotation,'r') as readAnnot:
geneFound=0
for ligne in readAnnot.readlines():
ligneSplit=ligne.split()
if ligneSplit[0] == queryGeneType and geneFound==0:
if ligneSplit[1] == queryGeneName:
geneFound = 1
elif geneFound ==1 and ligneSplit[0] == "CDS":
geneCoords=ligneSplit[1].split("..")
geneCoords= [int(i) for i in geneCoords]
wflankedGeneCoords = [geneCoords[0]-flanking5Region, geneCoords[1]+flanking3Region]
if wflankedGeneCoords[0]<0:
wflankedGeneCoords[0]=0
if wflankedGeneCoords[1] > sizeMitoSequence:
wflankedGeneCoords[1]=sizeMitoSequence
geneFound=0
break
except:
print nomFichierMitoAnnotation+" not found"
#print wflankedGeneCoords, geneCoords
queryGeneSequence=mitoSequence[wflankedGeneCoords[0]:wflankedGeneCoords[1]]
#print queryGeneSequence
local_seq=Seq(queryGeneSequence,IUPAC.unambiguous_dna)
local_id=queryGeneName+"|"+"mitochrondrion"+"|"+accessNCBI+"|"+genusSpecies+"|"
local_description="coords5'-3':{0}-{1} with flanking regions:{2}-{3} flanking sizes5'-3':{4}&{5}bps".format(geneCoords[0],geneCoords[1],wflankedGeneCoords[0],wflankedGeneCoords[1],flanking5Region,flanking3Region)
local_record=SeqRecord(local_seq,id=local_id, description=local_description)
nomFichierFasta=dossierOutput+"/"+queryGeneName+"_"+accessNCBI+"_"+genusSpecies+".fa"
SeqIO.write(local_record,nomFichierFasta , "fasta")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment