Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
edna
custom_reference_database
Commits
17473728
Commit
17473728
authored
Mar 10, 2021
by
peguerin
Browse files
fix validate
parent
845873fd
Changes
1
Show whitespace changes
Inline
Side-by-side
valide.py
→
valid
at
e.py
View file @
17473728
...
...
@@ -68,6 +68,7 @@ def check_record_description(record_description):
idSample
=
splitted
[
0
]
description
=
splitted
[
1
]
if
'='
in
idSample
:
## The sequence ID must not contain the '=' character, because '=' is only expected in the description
return
False
else
:
if
"species_name="
in
description
:
...
...
@@ -75,13 +76,16 @@ def check_record_description(record_description):
descriptionRecord
=
DescriptionRecord
(
sampleid
=
idSample
,
species_name
=
speciesName
)
return
descriptionRecord
else
:
## no species_name element in description
return
False
else
:
## the header doesn't have at least 2 columns separated by ;
return
False
def
check_format_species_name
(
species_name
):
'''
Check species name format
Expected format "Genus species" or "Genus_species"
'''
patternSpeciesPerfect
=
"^[A-Z][a-z]+ [a-z]+$"
if
re
.
match
(
patternSpeciesPerfect
,
species_name
):
...
...
@@ -104,17 +108,20 @@ def check_dna_sequence(dna_sequence, code="ATGC"):
def
check_taxonomy_species_name
(
species_name
,
reference
):
matching
=
{
k
:
v
for
k
,
v
in
reference
.
items
()
if
species_name
in
v
}
matching
=
{
k
:
v
for
k
,
v
in
reference
.
items
()
if
species_name
==
v
}
if
len
(
matching
)
==
1
:
rank
=
list
(
ncbi
.
get_rank
(
list
(
matching
.
keys
())).
values
())[
0
]
if
rank
==
"species"
:
return
matching
else
:
return
"the rank is not species but {0}"
.
format
(
rank
)
## the rank is not "species"
return
False
elif
len
(
matching
)
>
1
:
return
"more than 1 matching species: {0}"
.
format
(
eval
(
str
(
matching
)))
## there are many matches, so the species name is ambiguous
return
False
else
:
return
"species name not found in ncbi"
## this species is unknown in ncbi
return
False
def
full_taxonomy_sample
(
sampleid
,
taxonid
):
...
...
@@ -132,6 +139,7 @@ def full_taxonomy_sample(sampleid, taxonid):
if
idGenus
and
idFamily
:
break
if
idGenus
is
False
or
idFamily
is
False
:
## Genus or Family is unknown in ncbi
return
False
else
:
genusName
=
str
(
ncbi
.
translate_to_names
([
idGenus
])[
0
])
...
...
@@ -147,8 +155,10 @@ def full_taxonomy_sample(sampleid, taxonid):
rank
=
"species"
)
return
sample
else
:
## The taxon taxonomy is unknown or empty
return
False
else
:
## This taxon has no taxonomy in ncbi
return
False
...
...
@@ -162,7 +172,7 @@ def full_taxonomy_sample(sampleid, taxonid):
##args
localTaxdumpArchive
=
"TAXO/taxdump_2021.tar.gz"
rawFastaFile
=
"resources/test/raw.fasta"
rawFastaFile
=
"teleo_ok.fasta"
ncbi
=
NCBITaxa
()
...
...
@@ -177,7 +187,8 @@ vertDic = dict(zip_iterator)
valideRecords
=
[]
faultyRecords
=
[]
faultyFormatRecords
=
[]
faultyTaxonRecords
=
[]
for
record
in
SeqIO
.
parse
(
rawFastaFile
,
"fasta"
):
#print(record.id)
...
...
@@ -188,8 +199,9 @@ for record in SeqIO.parse(rawFastaFile, "fasta"):
checkedFormat
=
check_format_species_name
(
checkedRecord
.
species_name
)
if
checkedFormat
is
not
False
:
if
check_dna_sequence
(
record
.
seq
):
checkedTaxon
=
check_taxonomy_species_name
(
checkedFormat
,
vertDic
)
if
isinstance
(
checkedTaxon
,
dict
):
checkedRecord
.
species_name
=
checkedFormat
checkedTaxon
=
check_taxonomy_species_name
(
checkedRecord
.
species_name
,
vertDic
)
if
checkedTaxon
is
not
False
:
#print(checkedTaxon)
sample
=
full_taxonomy_sample
(
checkedRecord
.
sampleid
,
list
(
checkedTaxon
.
keys
())[
0
])
if
sample
is
not
False
:
...
...
@@ -197,33 +209,33 @@ for record in SeqIO.parse(rawFastaFile, "fasta"):
valideRecords
.
append
(
thisValideRecord
)
else
:
## faulty rank taxonomy
faultydescription
=
str
(
record
.
description
)
+
'
|
faulty rank taxonomy: family or genera not found'
faultydescription
=
str
(
record
.
description
)
+
'
;
faulty rank taxonomy: family or genera not found'
thisFaultyRecord
=
SeqRecord
(
id
=
record
.
id
,
description
=
faultydescription
,
seq
=
record
.
seq
)
faultyRecords
.
append
(
thisFaultyRecord
)
faulty
Taxon
Records
.
append
(
thisFaultyRecord
)
else
:
## faulty taxonomy
faultydescription
=
str
(
record
.
description
)
+
'
|
faulty taxonomy '
+
str
(
checked
Taxon
)
faultydescription
=
str
(
record
.
description
)
+
'
;
faulty taxonomy
: species name
'
+
str
(
checked
Record
.
species_name
)
+
' not found in NCBI'
thisFaultyRecord
=
SeqRecord
(
id
=
record
.
id
,
description
=
faultydescription
,
seq
=
record
.
seq
)
faultyRecords
.
append
(
thisFaultyRecord
)
faulty
Taxon
Records
.
append
(
thisFaultyRecord
)
else
:
faultydescription
=
str
(
record
.
description
)
+
'
|
faulty DNA sequence'
faultydescription
=
str
(
record
.
description
)
+
'
;
faulty DNA sequence'
thisFaultyRecord
=
SeqRecord
(
id
=
record
.
id
,
description
=
faultydescription
,
seq
=
record
.
seq
)
faultyRecords
.
append
(
thisFaultyRecord
)
faulty
Format
Records
.
append
(
thisFaultyRecord
)
else
:
## faulty species name format
faultydescription
=
str
(
record
.
description
)
+
'
|
faulty species name format '
+
str
(
checkedRecord
.
species_name
)
faultydescription
=
str
(
record
.
description
)
+
'
;
faulty species name format '
+
str
(
checkedRecord
.
species_name
)
thisFaultyRecord
=
SeqRecord
(
id
=
record
.
id
,
description
=
faultydescription
,
seq
=
record
.
seq
)
faultyRecords
.
append
(
thisFaultyRecord
)
faulty
Format
Records
.
append
(
thisFaultyRecord
)
else
:
## faulty record description format
print
(
"ERROR FORMAT FASTA {0}: {1} {2}.
\n
Excepted format is
\n\n
>sampleid ; species_names=Genus Species
\n
ACTAG
\n
"
.
format
(
rawFastaFile
,
record
.
id
,
record
.
description
))
## write faulty format records fasta
SeqIO
.
write
(
faultyFormatRecords
,
"faulty_format.fasta"
,
"fasta"
)
## write faulty taxon records fasta
SeqIO
.
write
(
faultyTaxonRecords
,
"faulty_taxon.fasta"
,
"fasta"
)
## write valide records fasta
SeqIO
.
write
(
valideRecords
,
"valide.fasta"
,
"fasta"
)
## write faulty records fasta
SeqIO
.
write
(
faultyRecords
,
"faulty.fasta"
,
"fasta"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment