Commit b0f115f0 authored by eortega's avatar eortega
Browse files

Added script to detect PAMs and to make a consensus reference with T0-WT

parent 8b70093a
......@@ -23,4 +23,4 @@
*multiqc_fastqc.txt
*multiqc_general_stats.txt
*multiqc_sources.txt
bacte*
......@@ -27,7 +27,8 @@ source ${py_env_dir}/bin/activate
pip install --upgrade pip
##
pip install -r scripts/requirements_py-env.txt
# pip install -r scripts/requirements_py-env.txt
pip install biopython multiqc numpy pandas ipython pyvcf
# pip install biopython pandas matplotlib multiqc pyvcf
......
#!/usr/bin/python3
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
#source file
# seq_test = "/home/manager/Documents/analyse_bioinfo/genome2972/seq_test"
# sf = "/home/manager/Documents/analyse_bioinfo/genome2972/Genome2972_1line.txt"
fasta_file = 'data/NC_007019.1_T0.fasta'
#output destination file
# df = "/home/manager/Documents/analyse_bioinfo/genome2972/output.txt"
df = 'steps/PAM_detection/PAM_protospacers.fasta'
## MOTIF
pat = "AGAA"
pat2 = "GGAA"
## Motif as sequence object
pat_seq = Seq(pat, generic_dna)
pat2_seq = Seq(pat2, generic_dna)
## Protospacer length is 30 + 2 nuc betweent the PAM and the protospacer
prot_len = 32
## FASTA HEADER INFORMATION
## Fasta ID will be: PAM_CoordRef_genome
## CoordRef is the index of the left most letter of the motif
id_pre = 'PAM'
id_post = fasta_file.replace('data/','').replace('.fasta', '')
## The description for the fasta record:
## Motif=xxxx;Strand=+;Protospacer_length=N
desc_motif = 'Ḿotif='
desc_strand = 'Strand='
desc_proto_len = 'Protospacer_lenth=' + str(prot_len)
## Import sequence
# with open(sf, 'r') as f:
# texte = f.readlines()[0]
# texte = texte.upper()
ref = SeqIO.read(fasta_file, "fasta")
## SLIDING WINDOW
indices = []
records = []
count_pat_for=0
count_pat_crv=0
count_pat2_for=0
count_pat2_crv=0
for i in range(0,len(ref.seq)):
a = ref.seq[i : i + len(pat)]
if pat_seq in a:
count_pat_for +=1
indices.append(i)
prot_PAM_seq = ref.seq[i - prot_len : i + len(pat_seq)]
record = SeqRecord( seq=prot_PAM_seq, \
id = '_'.join([id_pre, str(i), id_post]), \
description = ';'.join([desc_motif+pat, desc_strand+'+', desc_proto_len]) )
records.append(record)
elif pat_seq.reverse_complement() in a:
count_pat_crv +=1
indices.append(i)
prot_PAM_seq = ref.seq[i : i + len(pat_seq) + prot_len]
record = SeqRecord(seq=prot_PAM_seq, \
id = '_'.join([id_pre, str(i), id_post]), \
description = ';'.join([desc_motif+pat, desc_strand+'-', desc_proto_len]))
records.append(record)
elif pat2_seq in a:
count_pat2_for +=1
indices.append(i)
prot_PAM_seq = ref.seq[i - prot_len : i + len(pat2_seq)]
record = SeqRecord( seq=prot_PAM_seq, \
id = '_'.join([id_pre, str(i), id_post]), \
description = ';'.join([desc_motif+pat2, desc_strand+'+', desc_proto_len]) )
records.append(record)
elif pat2_seq.reverse_complement() in a :
count_pat2_crv +=1
indices.append(i)
prot_PAM_seq = ref.seq[i : i + len(pat2_seq) + prot_len]
record = SeqRecord(seq=prot_PAM_seq, \
id = '_'.join([id_pre, str(i), id_post]), \
description = ';'.join([desc_motif+pat2, desc_strand+'-', desc_proto_len]))
records.append(record)
with open(df, 'w') as handle:
SeqIO.write(records, handle, 'fasta')
print("The Motif {0} was found {1} times in the forward strand \
and {2} times in the reverse strand".format(pat, \
count_pat_for, \
count_pat_crv))
print("The Motif {0} was found {1} times in the forward strand \
and {2} times in the reverse strand\n".format(pat2, \
count_pat2_for, \
count_pat2_crv))
print("There are in total {} fasta sequences".format(len(records)))
print("The output file is: {}".format(df))
# with open(df, 'w') as dest:
# for i in indices:
# if i < 32:
# pass
# else:
# output = texte[i - 32 : i + len(pat)]
# print(output)
# dest.write(output + "\n")
#!/usr/bin/python3
from Bio import SeqIO
from scripts import vcf_parser3 as vp
import numpy as np
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
## This script allows to create a consensus sequence froma
## FASTA file and a VCF file
## The equivalent in bcftools is:
## bcftools consensus --sample unknown -f NC_007019.1.fasta TO-WT_S83.vcf.gz -o test.fasta
## Input fasta file
fasta_file = "data/NC_007019.1.fasta"
## Output file
outfile = fasta_file.replace('.fasta', '_T0.fasta').replace('data','steps/new_ref_T0')
## Minimum haplotype frequency to replace REF genotype with ALT genotype
min_hap_freq = 0.45
print("\nOnly mutations with an Allele Frequency higher than {} will be retained\n".format(min_hap_freq))
## Import sequence
ref = SeqIO.read(fasta_file, "fasta")
## Copy the sequence as a string into a variable
seq_str = str(ref.seq)
## Import vcf to be used as a guide for replacements
vcf_file = vp.ImportVCF2()
vcf_file.load_vcf('data/TO-WT_S83.vcf')
counter = 0
## Loop through the reversed list of indexes.
for i in reversed(vcf_file.index):
## Create a list containing the Allele Frequency for that Variant
my_list = list(np.array(vcf_file.INFO[i]['AO'])/vcf_file.INFO[i]['DP'])
## Extra info for the stdout
# print("Mutation ID= {} ; AF= {}".format(i, my_list))
## Filter out Variants with a low Allele Frequency.
## Only keep the Most frequent Variant
if max(my_list) > min_hap_freq:
## Extra info for the stdout
# print("Retained Mutation")
counter +=1
## Get the index of the Variant (haplotype)
hap_index = my_list.index(max(my_list))
## Get the position|locus for the variant with the higher AF.
## Remove 1 to make it fit with python indexes of strings
pos_str = vcf_file.POS[i] -1
## Get the lenght of the Variant
length = len(vcf_file.REF[i])
## Create a new string with the sequence Before the variant + Alt variant + sequence After variant
seq_str = seq_str[:pos_str] + str(vcf_file.ALT[i][hap_index]) + seq_str[pos_str + length :]
## Print stuff fot the user
print("DP= {}, AO= {}".format(vcf_file.INFO[i]['DP'], my_list))
print( "Index= {} ; ALT= {} ; REF= {} ; POS= {} ".format(hap_index, vcf_file.ALT[i][hap_index], vcf_file.REF[i], vcf_file.POS[i]) )
print( "POS= {} ; REF_found={} ;".format(pos_str+1, seq_str[pos_str:pos_str+length] ))
print("REF= " + seq_str[pos_str-5:pos_str].lower() + \
str(vcf_file.REF[i]) + \
seq_str[pos_str + length:pos_str + length + 4].lower() )
print("ALT= " + seq_str[pos_str-5:pos_str].lower() + str(vcf_file.ALT[i][hap_index]) + seq_str[pos_str + length : pos_str + length + 4].lower() + '\n')
## Extra info for the stdout
# else:
# print("Not retained\n")
print("Original length of sequence: {}".format(len(ref)))
print("New sequence length : {} \n".format(len(seq_str)))
print("Number of integrated Mutations : {}\n".format(counter))
## Convert sequence string into a sequence
seq_object = Seq(seq_str, 'generic_dna')
## Replace attributes in original sequence object
ref.seq = seq_object
ref.id = ref.id + '_T0'
ref.description = ref.description + \
'. Added mutations accumulated before for the reference T0-WT.'
## Write new sequence to file
with open(outfile, "w") as handle:
SeqIO.write(ref, handle, "fasta")
print("File saved in: {}".format(outfile))
###########################################################################
## TEST DUMMY EXAMPLES
###########################################################################
#### Sample string for working with indexes:
# asd = 'abcdefghijklmnopqrstuvwxyz'
# for i in range(0,len(asd)):
# print('{} : {}'.format(i, asd[i]))
# ind=4 # INDEX = pos_str
# l=1 # LENGTH of the pattern
# print(asd[ :ind] + 'E' + asd[ind + l: ]) #Slice before and after the required position
## With a longer pattern
# ind = 10
# l = 4
# asd[:ind] + 'KLMN' + asd[ind+l:]
#### Test sequence
# from Bio.Seq import Seq
# from Bio.Alphabet import generic_dna
# my_dna = Seq("AGTACACTGGT", generic_dna)
The Motif AGAA was found 317 times in the forward strand and 96 times in the reverse strand
The Motif GGAA was found 208 times in the forward strand and 72 times in the reverse strand
There are in total 693 fasta sequences
The output file is: steps/PAM_detection/PAM_protospacers.fasta
......@@ -7,4 +7,4 @@ for i in $(find . -name *.bam);
do
echo -ne $i'\t'$(samtools flagstat -@ 10 $i | grep 'mapped (')'\n' >> mapping_flagstat.txt;
done
```
\ No newline at end of file
```
Only mutations with an Allele Frequency higher than 0.45 will be retained
DP= 545, AO= [1.0]
Index= 0 ; ALT= A ; REF= G ; POS= 34230
POS= 34230 ; REF_found=A ;
REF= cggacGgtct
ALT= cggacAgtct
DP= 108, AO= [0.009259259259259259, 0.09259259259259259, 0.009259259259259259, 0.8611111111111112]
Index= 3 ; ALT= TAAAATAAATAAATAAATAAATAAATATATATATAGAG ; REF= TAAAATAAATAAATAAATAAATATATATATAGAG ; POS= 32310
POS= 32310 ; REF_found=TAAAATAAATAAATAAATAAATAAATATATATAT ;
REF= atattTAAAATAAATAAATAAATAAATATATATATAGAGagag
ALT= atattTAAAATAAATAAATAAATAAATAAATATATATATAGAGagag
DP= 361, AO= [0.997229916897507]
Index= 0 ; ALT= A ; REF= G ; POS= 32245
POS= 32245 ; REF_found=A ;
REF= tcctcGaaag
ALT= tcctcAaaag
DP= 936, AO= [0.9989316239316239]
Index= 0 ; ALT= G ; REF= A ; POS= 14045
POS= 14045 ; REF_found=G ;
REF= gctggAagca
ALT= gctggGagca
DP= 904, AO= [0.9988938053097345]
Index= 0 ; ALT= T ; REF= G ; POS= 2952
POS= 2952 ; REF_found=T ;
REF= acagaGcaag
ALT= acagaTcaag
Original length of sequence: 34704
New sequence length : 34708
Number of integrated Mutations : 5
File saved in: steps/new_ref_T0/NC_007019.1_T0.fasta
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment