Commit 17473728 authored by peguerin's avatar peguerin
Browse files

fix validate

parent 845873fd
...@@ -68,6 +68,7 @@ def check_record_description(record_description): ...@@ -68,6 +68,7 @@ def check_record_description(record_description):
idSample = splitted[0] idSample = splitted[0]
description = splitted[1] description = splitted[1]
if '=' in idSample: if '=' in idSample:
## L'ID de la sequence ne doit pas comporter de caractere '=' car c'est attendu dans la description
return False return False
else: else:
if "species_name=" in description: if "species_name=" in description:
...@@ -75,13 +76,16 @@ def check_record_description(record_description): ...@@ -75,13 +76,16 @@ def check_record_description(record_description):
descriptionRecord = DescriptionRecord(sampleid=idSample, species_name=speciesName) descriptionRecord = DescriptionRecord(sampleid=idSample, species_name=speciesName)
return descriptionRecord return descriptionRecord
else: else:
## no species_name element in description
return False return False
else: else:
## the header doesn't have at least 2 colons separeted by ;
return False return False
def check_format_species_name(species_name): def check_format_species_name(species_name):
''' '''
Check species name format Check species name format
Excepted format "Genus species" or "Genus_species"
''' '''
patternSpeciesPerfect = "^[A-Z][a-z]+ [a-z]+$" patternSpeciesPerfect = "^[A-Z][a-z]+ [a-z]+$"
if re.match(patternSpeciesPerfect, species_name): if re.match(patternSpeciesPerfect, species_name):
...@@ -96,7 +100,7 @@ def check_dna_sequence(dna_sequence, code="ATGC"): ...@@ -96,7 +100,7 @@ def check_dna_sequence(dna_sequence, code="ATGC"):
Check DNA sequence format (iuapc ambiguity, gaps) Check DNA sequence format (iuapc ambiguity, gaps)
''' '''
for base in dna_sequence: for base in dna_sequence:
if base not in code: if base not in code:
return False return False
return True return True
...@@ -104,17 +108,20 @@ def check_dna_sequence(dna_sequence, code="ATGC"): ...@@ -104,17 +108,20 @@ def check_dna_sequence(dna_sequence, code="ATGC"):
def check_taxonomy_species_name(species_name, reference): def check_taxonomy_species_name(species_name, reference):
matching = { k:v for k,v in reference.items() if species_name in v } matching = { k:v for k,v in reference.items() if species_name == v }
if len(matching) == 1: if len(matching) == 1:
rank = list(ncbi.get_rank(list(matching.keys())).values())[0] rank = list(ncbi.get_rank(list(matching.keys())).values())[0]
if rank == "species": if rank == "species":
return matching return matching
else: else:
return "the rank is not species but {0}".format(rank) ## the rank in not "species"
return False
elif len(matching) > 1: elif len(matching) > 1:
return "more than 1 matching species: {0}".format(eval(str(matching))) ## they are many matches so the species name is ambiguous
return False
else: else:
return "species name not found in ncbi" ## this species is unknown in ncbi
return False
def full_taxonomy_sample(sampleid, taxonid): def full_taxonomy_sample(sampleid, taxonid):
...@@ -132,6 +139,7 @@ def full_taxonomy_sample(sampleid, taxonid): ...@@ -132,6 +139,7 @@ def full_taxonomy_sample(sampleid, taxonid):
if idGenus and idFamily: if idGenus and idFamily:
break break
if idGenus is False or idFamily is False: if idGenus is False or idFamily is False:
## Genus or Family is unknown in ncbi
return False return False
else: else:
genusName = str(ncbi.translate_to_names([idGenus])[0]) genusName = str(ncbi.translate_to_names([idGenus])[0])
...@@ -147,8 +155,10 @@ def full_taxonomy_sample(sampleid, taxonid): ...@@ -147,8 +155,10 @@ def full_taxonomy_sample(sampleid, taxonid):
rank = "species") rank = "species")
return sample return sample
else: else:
## The taxon taxonomy is unknown or empty
return False return False
else: else:
## This taxon have no taxonomy in ncbi
return False return False
...@@ -162,7 +172,7 @@ def full_taxonomy_sample(sampleid, taxonid): ...@@ -162,7 +172,7 @@ def full_taxonomy_sample(sampleid, taxonid):
##args ##args
localTaxdumpArchive = "TAXO/taxdump_2021.tar.gz" localTaxdumpArchive = "TAXO/taxdump_2021.tar.gz"
rawFastaFile="resources/test/raw.fasta" rawFastaFile="resources/test/raw.fasta"
rawFastaFile="teleo_ok.fasta"
ncbi = NCBITaxa() ncbi = NCBITaxa()
...@@ -177,7 +187,8 @@ vertDic = dict(zip_iterator) ...@@ -177,7 +187,8 @@ vertDic = dict(zip_iterator)
valideRecords = [] valideRecords = []
faultyRecords = [] faultyFormatRecords = []
faultyTaxonRecords = []
for record in SeqIO.parse(rawFastaFile, "fasta"): for record in SeqIO.parse(rawFastaFile, "fasta"):
#print(record.id) #print(record.id)
...@@ -187,9 +198,10 @@ for record in SeqIO.parse(rawFastaFile, "fasta"): ...@@ -187,9 +198,10 @@ for record in SeqIO.parse(rawFastaFile, "fasta"):
if checkedRecord is not False: if checkedRecord is not False:
checkedFormat = check_format_species_name(checkedRecord.species_name) checkedFormat = check_format_species_name(checkedRecord.species_name)
if checkedFormat is not False: if checkedFormat is not False:
if check_dna_sequence(record.seq): if check_dna_sequence(record.seq):
checkedTaxon = check_taxonomy_species_name(checkedFormat, vertDic) checkedRecord.species_name = checkedFormat
if isinstance(checkedTaxon, dict): checkedTaxon = check_taxonomy_species_name(checkedRecord.species_name, vertDic)
if checkedTaxon is not False:
#print(checkedTaxon) #print(checkedTaxon)
sample = full_taxonomy_sample(checkedRecord.sampleid , list(checkedTaxon.keys())[0]) sample = full_taxonomy_sample(checkedRecord.sampleid , list(checkedTaxon.keys())[0])
if sample is not False: if sample is not False:
...@@ -197,33 +209,33 @@ for record in SeqIO.parse(rawFastaFile, "fasta"): ...@@ -197,33 +209,33 @@ for record in SeqIO.parse(rawFastaFile, "fasta"):
valideRecords.append(thisValideRecord) valideRecords.append(thisValideRecord)
else: else:
## faulty rank taxonomy ## faulty rank taxonomy
faultydescription = str(record.description)+' | faulty rank taxonomy: family or genera not found' faultydescription = str(record.description)+'; faulty rank taxonomy: family or genera not found'
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq) thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord) faultyTaxonRecords.append(thisFaultyRecord)
else: else:
## faulty taxonomy ## faulty taxonomy
faultydescription = str(record.description)+' | faulty taxonomy '+str(checkedTaxon) faultydescription = str(record.description)+'; faulty taxonomy: species name '+ str(checkedRecord.species_name)+ ' not found in NCBI'
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq) thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord) faultyTaxonRecords.append(thisFaultyRecord)
else: else:
faultydescription = str(record.description)+' | faulty DNA sequence' faultydescription = str(record.description)+' ; faulty DNA sequence'
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq) thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord) faultyFormatRecords.append(thisFaultyRecord)
else: else:
## faulty species name format ## faulty species name format
faultydescription = str(record.description)+' | faulty species name format ' + str(checkedRecord.species_name) faultydescription = str(record.description)+' ; faulty species name format ' + str(checkedRecord.species_name)
thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq) thisFaultyRecord = SeqRecord(id = record.id, description = faultydescription, seq=record.seq)
faultyRecords.append(thisFaultyRecord) faultyFormatRecords.append(thisFaultyRecord)
else: else:
## faulty record description format ## faulty record description format
print("ERROR FORMAT FASTA {0}: {1} {2}.\n Excepted format is\n\n>sampleid ; species_names=Genus Species\nACTAG\n".format(rawFastaFile,record.id, record.description)) print("ERROR FORMAT FASTA {0}: {1} {2}.\n Excepted format is\n\n>sampleid ; species_names=Genus Species\nACTAG\n".format(rawFastaFile,record.id, record.description))
## write faulty format records fasta
SeqIO.write(faultyFormatRecords, "faulty_format.fasta", "fasta")
## write faulty format records fasta
SeqIO.write(faultyTaxonRecords, "faulty_taxon.fasta", "fasta")
## write valide records fasta ## write valide records fasta
SeqIO.write(valideRecords, "valide.fasta", "fasta") SeqIO.write(valideRecords, "valide.fasta", "fasta")
## write faulty records fasta
SeqIO.write(faultyRecords, "faulty.fasta", "fasta")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment