Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
eortega
coevolution
Commits
b0f115f0
Commit
b0f115f0
authored
Feb 21, 2020
by
eortega
Browse files
Added script to detect PAMs and to make a consensus reference with T0-WT
parent
8b70093a
Changes
34
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
b0f115f0
...
...
@@ -23,4 +23,4 @@
*multiqc_fastqc.txt
*multiqc_general_stats.txt
*multiqc_sources.txt
bacte*
phages/scripts/00_create_py_env.sh
View file @
b0f115f0
...
...
@@ -27,7 +27,8 @@ source ${py_env_dir}/bin/activate
pip
install
--upgrade
pip
##
pip
install
-r
scripts/requirements_py-env.txt
# pip install -r scripts/requirements_py-env.txt
pip
install
biopython multiqc numpy pandas ipython pyvcf
# pip install biopython pandas matplotlib multiqc pyvcf
...
...
phages/scripts/get_PAM.py
0 → 100644
View file @
b0f115f0
#!/usr/bin/python3
from
Bio
import
SeqIO
from
Bio.Seq
import
Seq
from
Bio.SeqRecord
import
SeqRecord
from
Bio.Alphabet
import
generic_dna
#source file
# seq_test = "/home/manager/Documents/analyse_bioinfo/genome2972/seq_test"
# sf = "/home/manager/Documents/analyse_bioinfo/genome2972/Genome2972_1line.txt"
fasta_file
=
'data/NC_007019.1_T0.fasta'
#output destination file
# df = "/home/manager/Documents/analyse_bioinfo/genome2972/output.txt"
df
=
'steps/PAM_detection/PAM_protospacers.fasta'
## MOTIF
pat
=
"AGAA"
pat2
=
"GGAA"
## Motif as sequence object
pat_seq
=
Seq
(
pat
,
generic_dna
)
pat2_seq
=
Seq
(
pat2
,
generic_dna
)
## Protospacer length is 30 + 2 nuc betweent the PAM and the protospacer
prot_len
=
32
## FASTA HEADER INFORMATION
## Fasta ID will be: PAM_CoordRef_genome
## CoordRef is the index of the left most letter of the motif
id_pre
=
'PAM'
id_post
=
fasta_file
.
replace
(
'data/'
,
''
).
replace
(
'.fasta'
,
''
)
## The description for the fasta record:
## Motif=xxxx;Strand=+;Protospacer_length=N
desc_motif
=
'Ḿotif='
desc_strand
=
'Strand='
desc_proto_len
=
'Protospacer_lenth='
+
str
(
prot_len
)
## Import sequence
# with open(sf, 'r') as f:
# texte = f.readlines()[0]
# texte = texte.upper()
ref
=
SeqIO
.
read
(
fasta_file
,
"fasta"
)
## SLIDING WINDOW
indices
=
[]
records
=
[]
count_pat_for
=
0
count_pat_crv
=
0
count_pat2_for
=
0
count_pat2_crv
=
0
for
i
in
range
(
0
,
len
(
ref
.
seq
)):
a
=
ref
.
seq
[
i
:
i
+
len
(
pat
)]
if
pat_seq
in
a
:
count_pat_for
+=
1
indices
.
append
(
i
)
prot_PAM_seq
=
ref
.
seq
[
i
-
prot_len
:
i
+
len
(
pat_seq
)]
record
=
SeqRecord
(
seq
=
prot_PAM_seq
,
\
id
=
'_'
.
join
([
id_pre
,
str
(
i
),
id_post
]),
\
description
=
';'
.
join
([
desc_motif
+
pat
,
desc_strand
+
'+'
,
desc_proto_len
])
)
records
.
append
(
record
)
elif
pat_seq
.
reverse_complement
()
in
a
:
count_pat_crv
+=
1
indices
.
append
(
i
)
prot_PAM_seq
=
ref
.
seq
[
i
:
i
+
len
(
pat_seq
)
+
prot_len
]
record
=
SeqRecord
(
seq
=
prot_PAM_seq
,
\
id
=
'_'
.
join
([
id_pre
,
str
(
i
),
id_post
]),
\
description
=
';'
.
join
([
desc_motif
+
pat
,
desc_strand
+
'-'
,
desc_proto_len
]))
records
.
append
(
record
)
elif
pat2_seq
in
a
:
count_pat2_for
+=
1
indices
.
append
(
i
)
prot_PAM_seq
=
ref
.
seq
[
i
-
prot_len
:
i
+
len
(
pat2_seq
)]
record
=
SeqRecord
(
seq
=
prot_PAM_seq
,
\
id
=
'_'
.
join
([
id_pre
,
str
(
i
),
id_post
]),
\
description
=
';'
.
join
([
desc_motif
+
pat2
,
desc_strand
+
'+'
,
desc_proto_len
])
)
records
.
append
(
record
)
elif
pat2_seq
.
reverse_complement
()
in
a
:
count_pat2_crv
+=
1
indices
.
append
(
i
)
prot_PAM_seq
=
ref
.
seq
[
i
:
i
+
len
(
pat2_seq
)
+
prot_len
]
record
=
SeqRecord
(
seq
=
prot_PAM_seq
,
\
id
=
'_'
.
join
([
id_pre
,
str
(
i
),
id_post
]),
\
description
=
';'
.
join
([
desc_motif
+
pat2
,
desc_strand
+
'-'
,
desc_proto_len
]))
records
.
append
(
record
)
with
open
(
df
,
'w'
)
as
handle
:
SeqIO
.
write
(
records
,
handle
,
'fasta'
)
print
(
"The Motif {0} was found {1} times in the forward strand
\
and {2} times in the reverse strand"
.
format
(
pat
,
\
count_pat_for
,
\
count_pat_crv
))
print
(
"The Motif {0} was found {1} times in the forward strand
\
and {2} times in the reverse strand
\n
"
.
format
(
pat2
,
\
count_pat2_for
,
\
count_pat2_crv
))
print
(
"There are in total {} fasta sequences"
.
format
(
len
(
records
)))
print
(
"The output file is: {}"
.
format
(
df
))
# with open(df, 'w') as dest:
# for i in indices:
# if i < 32:
# pass
# else:
# output = texte[i - 32 : i + len(pat)]
# print(output)
# dest.write(output + "\n")
phages/scripts/update_ref_genome.py
0 → 100644
View file @
b0f115f0
#!/usr/bin/python3
from
Bio
import
SeqIO
from
scripts
import
vcf_parser3
as
vp
import
numpy
as
np
from
Bio.Seq
import
Seq
from
Bio.Alphabet
import
generic_dna
## This script allows to create a consensus sequence froma
## FASTA file and a VCF file
## The equivalent in bcftools is:
## bcftools consensus --sample unknown -f NC_007019.1.fasta TO-WT_S83.vcf.gz -o test.fasta
## Input fasta file
fasta_file
=
"data/NC_007019.1.fasta"
## Output file
outfile
=
fasta_file
.
replace
(
'.fasta'
,
'_T0.fasta'
).
replace
(
'data'
,
'steps/new_ref_T0'
)
## Minimum haplotype frequency to replace REF genotype with ALT genotype
min_hap_freq
=
0.45
print
(
"
\n
Only mutations with an Allele Frequency higher than {} will be retained
\n
"
.
format
(
min_hap_freq
))
## Import sequence
ref
=
SeqIO
.
read
(
fasta_file
,
"fasta"
)
## Copy the sequence as a string into a variable
seq_str
=
str
(
ref
.
seq
)
## Import vcf to be used as a guide for replacements
vcf_file
=
vp
.
ImportVCF2
()
vcf_file
.
load_vcf
(
'data/TO-WT_S83.vcf'
)
counter
=
0
## Loop through the reversed list of indexes.
for
i
in
reversed
(
vcf_file
.
index
):
## Create a list containing the Allele Frequency for that Variant
my_list
=
list
(
np
.
array
(
vcf_file
.
INFO
[
i
][
'AO'
])
/
vcf_file
.
INFO
[
i
][
'DP'
])
## Extra info for the stdout
# print("Mutation ID= {} ; AF= {}".format(i, my_list))
## Filter out Variants with a low Allele Frequency.
## Only keep the Most frequent Variant
if
max
(
my_list
)
>
min_hap_freq
:
## Extra info for the stdout
# print("Retained Mutation")
counter
+=
1
## Get the index of the Variant (haplotype)
hap_index
=
my_list
.
index
(
max
(
my_list
))
## Get the position|locus for the variant with the higher AF.
## Remove 1 to make it fit with python indexes of strings
pos_str
=
vcf_file
.
POS
[
i
]
-
1
## Get the lenght of the Variant
length
=
len
(
vcf_file
.
REF
[
i
])
## Create a new string with the sequence Before the variant + Alt variant + sequence After variant
seq_str
=
seq_str
[:
pos_str
]
+
str
(
vcf_file
.
ALT
[
i
][
hap_index
])
+
seq_str
[
pos_str
+
length
:]
## Print stuff fot the user
print
(
"DP= {}, AO= {}"
.
format
(
vcf_file
.
INFO
[
i
][
'DP'
],
my_list
))
print
(
"Index= {} ; ALT= {} ; REF= {} ; POS= {} "
.
format
(
hap_index
,
vcf_file
.
ALT
[
i
][
hap_index
],
vcf_file
.
REF
[
i
],
vcf_file
.
POS
[
i
])
)
print
(
"POS= {} ; REF_found={} ;"
.
format
(
pos_str
+
1
,
seq_str
[
pos_str
:
pos_str
+
length
]
))
print
(
"REF= "
+
seq_str
[
pos_str
-
5
:
pos_str
].
lower
()
+
\
str
(
vcf_file
.
REF
[
i
])
+
\
seq_str
[
pos_str
+
length
:
pos_str
+
length
+
4
].
lower
()
)
print
(
"ALT= "
+
seq_str
[
pos_str
-
5
:
pos_str
].
lower
()
+
str
(
vcf_file
.
ALT
[
i
][
hap_index
])
+
seq_str
[
pos_str
+
length
:
pos_str
+
length
+
4
].
lower
()
+
'
\n
'
)
## Extra info for the stdout
# else:
# print("Not retained\n")
print
(
"Original length of sequence: {}"
.
format
(
len
(
ref
)))
print
(
"New sequence length : {}
\n
"
.
format
(
len
(
seq_str
)))
print
(
"Number of integrated Mutations : {}
\n
"
.
format
(
counter
))
## Convert sequence string into a sequence
seq_object
=
Seq
(
seq_str
,
'generic_dna'
)
## Replace attributes in original sequence object
ref
.
seq
=
seq_object
ref
.
id
=
ref
.
id
+
'_T0'
ref
.
description
=
ref
.
description
+
\
'. Added mutations accumulated before for the reference T0-WT.'
## Write new sequence to file
with
open
(
outfile
,
"w"
)
as
handle
:
SeqIO
.
write
(
ref
,
handle
,
"fasta"
)
print
(
"File saved in: {}"
.
format
(
outfile
))
###########################################################################
## TEST DUMMY EXAMPLES
###########################################################################
#### Sample string for working with indexes:
# asd = 'abcdefghijklmnopqrstuvwxyz'
# for i in range(0,len(asd)):
# print('{} : {}'.format(i, asd[i]))
# ind=4 # INDEX = pos_str
# l=1 # LENGTH of the pattern
# print(asd[ :ind] + 'E' + asd[ind + l: ]) #Slice before and after the required position
## With a longer pattern
# ind = 10
# l = 4
# asd[:ind] + 'KLMN' + asd[ind+l:]
#### Test sequence
# from Bio.Seq import Seq
# from Bio.Alphabet import generic_dna
# my_dna = Seq("AGTACACTGGT", generic_dna)
phages/steps/PAM_detection/log
0 → 100644
View file @
b0f115f0
The Motif AGAA was found 317 times in the forward strand and 96 times in the reverse strand
The Motif GGAA was found 208 times in the forward strand and 72 times in the reverse strand
There are in total 693 fasta sequences
The output file is: steps/PAM_detection/PAM_protospacers.fasta
phages/steps/mapping/README.md
View file @
b0f115f0
...
...
@@ -7,4 +7,4 @@ for i in $(find . -name *.bam);
do
echo
-ne
$i
'\t'
$(
samtools flagstat -@ 10
$i
|
grep
'mapped ('
)
'\n'
>>
mapping_flagstat.txt
;
done
```
\ No newline at end of file
```
phages/steps/new_ref_T0/log.txt
0 → 100644
View file @
b0f115f0
Only mutations with an Allele Frequency higher than 0.45 will be retained
DP= 545, AO= [1.0]
Index= 0 ; ALT= A ; REF= G ; POS= 34230
POS= 34230 ; REF_found=A ;
REF= cggacGgtct
ALT= cggacAgtct
DP= 108, AO= [0.009259259259259259, 0.09259259259259259, 0.009259259259259259, 0.8611111111111112]
Index= 3 ; ALT= TAAAATAAATAAATAAATAAATAAATATATATATAGAG ; REF= TAAAATAAATAAATAAATAAATATATATATAGAG ; POS= 32310
POS= 32310 ; REF_found=TAAAATAAATAAATAAATAAATAAATATATATAT ;
REF= atattTAAAATAAATAAATAAATAAATATATATATAGAGagag
ALT= atattTAAAATAAATAAATAAATAAATAAATATATATATAGAGagag
DP= 361, AO= [0.997229916897507]
Index= 0 ; ALT= A ; REF= G ; POS= 32245
POS= 32245 ; REF_found=A ;
REF= tcctcGaaag
ALT= tcctcAaaag
DP= 936, AO= [0.9989316239316239]
Index= 0 ; ALT= G ; REF= A ; POS= 14045
POS= 14045 ; REF_found=G ;
REF= gctggAagca
ALT= gctggGagca
DP= 904, AO= [0.9988938053097345]
Index= 0 ; ALT= T ; REF= G ; POS= 2952
POS= 2952 ; REF_found=T ;
REF= acagaGcaag
ALT= acagaTcaag
Original length of sequence: 34704
New sequence length : 34708
Number of integrated Mutations : 5
File saved in: steps/new_ref_T0/NC_007019.1_T0.fasta
phages/steps/qc_raw_data/multiqc/Other_seq/2972_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/A_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/B_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/CTRL_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/C_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/D_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/T_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/Other_seq/Undetermined_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/R_seq/R1_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/R_seq/R2_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/R_seq/R3_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/R_seq/R4_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
phages/steps/qc_raw_data/multiqc/R_seq/R5_data/multiqc_qualimap_bamqc_genome_results.txt
0 → 100644
View file @
b0f115f0
Sample
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment