Commit 17473728 authored by peguerin's avatar peguerin
Browse files

fix validate

parent 845873fd
......@@ -68,6 +68,7 @@ def check_record_description(record_description):
idSample = splitted[0]
description = splitted[1]
if '=' in idSample:
## L'ID de la sequence ne doit pas comporter de caractere '=' car c'est attendu dans la description
return False
else:
if "species_name=" in description:
......@@ -75,13 +76,16 @@ def check_record_description(record_description):
descriptionRecord = DescriptionRecord(sampleid=idSample, species_name=speciesName)
return descriptionRecord
else:
## no species_name element in description
return False
else:
## the header doesn't have at least 2 colons separeted by ;
return False
def check_format_species_name(species_name):
'''
Check species name format
Excepted format "Genus species" or "Genus_species"
'''
patternSpeciesPerfect = "^[A-Z][a-z]+ [a-z]+$"
if re.match(patternSpeciesPerfect, species_name):
......@@ -96,7 +100,7 @@ def check_dna_sequence(dna_sequence, code="ATGC"):
Check DNA sequence format (iuapc ambiguity, gaps)
'''
for base in dna_sequence:
if base not in code:
if base not in code:
return False
return True
......@@ -104,17 +108,20 @@ def check_dna_sequence(dna_sequence, code="ATGC"):
def check_taxonomy_species_name(species_name, reference):
matching = { k:v for k,v in reference.items() if species_name in v }
matching = { k:v for k,v in reference.items() if species_name == v }
if len(matching) == 1:
rank = list(ncbi.get_rank(list(matching.keys())).values())[0]
if rank == "species":
return matching
else:
return "the rank is not species but {0}".format(rank)
## the rank in not "species"
return False
elif len(matching) > 1:
return "more than 1 matching species: {0}".format(eval(str(matching)))
## they are many matches so the species name is ambiguous
return False
else:
return "species name not found in ncbi"
## this species is unknown in ncbi
return False
def full_taxonomy_sample(sampleid, taxonid):
......@@ -132,6 +139,7 @@ def full_taxonomy_sample(sampleid, taxonid):
if idGenus and idFamily:
break
if idGenus is False or idFamily is False:
## Genus or Family is unknown in ncbi
return False
else:
genusName = str(ncbi.translate_to_names([idGenus])[0])
......@@ -147,8 +155,10 @@ def full_taxonomy_sample(sampleid, taxonid):
rank = "species")
return sample
else:
## The taxon taxonomy is unknown or empty
return False
else:
## This taxon have no taxonomy in ncbi
return False
......@@ -162,7 +172,7 @@ def full_taxonomy_sample(sampleid, taxonid):
##args
localTaxdumpArchive = "TAXO/taxdump_2021.tar.gz"
rawFastaFile="resources/test/raw.fasta"
rawFastaFile="teleo_ok.fasta"
ncbi = NCBITaxa()
......@@ -177,7 +187,8 @@ vertDic = dict(zip_iterator)
valideRecords = []
faultyRecords = []
faultyFormatRecords = []
faultyTaxonRecords = []
for record in SeqIO.parse(rawFastaFile, "fasta"):
#print(record.id)
......@@ -187,9 +198,10 @@ for record in SeqIO.parse(rawFastaFile, "fasta"):
if checkedRecord is not False:
checkedFormat = check_format_species_name(checkedRecord.species_name)
if checkedFormat is not False:
if check_dna_sequence(record.seq):
checkedTaxon = check_taxonomy_species_name(checkedFormat, vertDic)
if isinstance(checkedTaxon, dict):
if check_dna_sequence(record.seq):
checkedRecord.species_name = checkedFormat
checkedTaxon = check_taxonomy_species_name(checkedRecord.species_name, vertDic)
if checkedTaxon is not False:
#print(checkedTaxon)
sample = full_taxonomy_sample(checkedRecord.sampleid , list(checkedTaxon.keys())[0])
if sample is not False:
......@@ -197,33 +209,33 @@ for record in SeqIO.parse(rawFastaFile, "fasta"):
valideRecords.append(thisValideRecord)
else:
## faulty rank taxonomy
faultydescription = str(record.description)+' | faulty rank taxonomy: family or genera not found'
faultydescription = str(record.description)+'; faulty rank taxonomy: family or genera not found'
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord)
faultyTaxonRecords.append(thisFaultyRecord)
else:
## faulty taxonomy
faultydescription = str(record.description)+' | faulty taxonomy '+str(checkedTaxon)
faultydescription = str(record.description)+'; faulty taxonomy: species name '+ str(checkedRecord.species_name)+ ' not found in NCBI'
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord)
faultyTaxonRecords.append(thisFaultyRecord)
else:
faultydescription = str(record.description)+' | faulty DNA sequence'
faultydescription = str(record.description)+' ; faulty DNA sequence'
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord)
faultyFormatRecords.append(thisFaultyRecord)
else:
## faulty species name format
faultydescription = str(record.description)+' | faulty species name format ' + str(checkedRecord.species_name)
faultydescription = str(record.description)+' ; faulty species name format ' + str(checkedRecord.species_name)
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord)
faultyFormatRecords.append(thisFaultyRecord)
else:
## faulty record description format
print("ERROR FORMAT FASTA {0}: {1} {2}.\n Excepted format is\n\n>sampleid ; species_names=Genus Species\nACTAG\n".format(rawFastaFile,record.id, record.description))
## write faulty format records fasta
SeqIO.write(faultyFormatRecords, "faulty_format.fasta", "fasta")
## write faulty format records fasta
SeqIO.write(faultyTaxonRecords, "faulty_taxon.fasta", "fasta")
## write valide records fasta
SeqIO.write(valideRecords, "valide.fasta", "fasta")
## write faulty records fasta
SeqIO.write(faultyRecords, "faulty.fasta", "fasta")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment