Commit b83d89cf authored by Enrique Ortega enrique.ortega@cefe.cnrs.fr's avatar Enrique Ortega enrique.ortega@cefe.cnrs.fr
Browse files

Lotte and Cornelia's scripts folder, first push

parent 29890785
Fri Mar 27 14:54:33 CET 2020
......@@ -27,8 +27,7 @@ source ${py_env_dir}/bin/activate
pip install --upgrade pip
##
# pip install -r scripts/requirements_py-env.txt
pip install biopython multiqc numpy pandas ipython pyvcf
pip install -r scripts/requirements_py-env.txt
# pip install biopython pandas matplotlib multiqc pyvcf
......
#!/bin/bash
## Quality control of trimmed reads: FastQC per file, MultiQC per sample group.
## Launch example from coevolution/phages/
## ./scripts/01_quality_check.sh $PWD/ qc_rawd_data
##
## Arguments:
##   $1 - PATH TO WORKING DIRECTORY (project dir) -- WITH trailing slash
##   $2 - name of the step directory to create under ${path}steps/
## Abort with a usage message instead of running the rm -r below on an
## empty path when an argument is missing.
path=${1:?usage: $0 <working-dir-with-trailing-slash> <step-name>}
#path=/home/enrique/work/Gandon/coevolution/phages/
step_name=${2:?usage: $0 <working-dir-with-trailing-slash> <step-name>}
## START ENVIRONMENT TO EXECUTE MULTIQC
source activate coevolution
## Some temporary files are created nearby the source files
## Using links from a tmp will avoid errors in case the directory
## containing the raw data is not writable
## SEPARATE THE FILES BY NAME
echo "CREATE TEMPORARY DIRECTORIES"
W_seq=$(mktemp -d)
R_seq=$(mktemp -d)
Other_seq=$(mktemp -d)
echo "$W_seq" "$R_seq" "$Other_seq"
### CREATE SYMBOLIC LINKS TO SEQUENCE FILES
### DIVIDED IN 3 DIRECTORIES
#echo "CREATE SYMBOLIC LINKS"
#for i in "${path}steps/trimming/W_seq/"*; do ln -s "$i" "$W_seq"; done
#for i in "${path}steps/trimming/R_seq/"*; do ln -s "$i" "$R_seq"; done
#for i in "${path}steps/trimming/Other_seq/"*; do ln -s "$i" "$Other_seq"; done
#echo "CREATE SYMBOLIC LINKS -- DONE"
### RUN FASTQC ON EACH GROUP
mkdir -p "${path}steps/${step_name}/fastqc"/{W_seq,R_seq,Other_seq}
fastqc -t 35 --noextract -o "${path}steps/${step_name}/fastqc/W_seq" "${path}steps/trimming/W_seq/"*
# multiqc -f -i W_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/W_seq
fastqc -t 35 --noextract -o "${path}steps/${step_name}/fastqc/R_seq" "${path}steps/trimming/R_seq/"*
# multiqc -f -i R_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/R_seq/
fastqc -t 35 --noextract -o "${path}steps/${step_name}/fastqc/Other_seq" "${path}steps/trimming/Other_seq/"*
# multiqc -f -i Other_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/Other_seq
## MAKE MULTIQC
# multiqc -f -i W1_seq -n W1 -o ${path}qual/multiqc/ ${path}qual/fastqc/W_seq/W1*
## LOOP MULTIQC DEPENDING ON THE INPUTS
## MAKE DIRECTORIES FOR THE DIFFERENT SAMPLES:
mkdir -p "${path}steps/${step_name}/multiqc"/{W_seq,R_seq,Other_seq}
## One MultiQC report per W/R population (W1..W8, R1..R8).
for i in {1..8}
do
    multiqc -f -i "W${i}_seq" -n "W${i}" -o "${path}steps/${step_name}/multiqc/W_seq" "${path}steps/${step_name}/fastqc/W_seq/W${i}"*
    multiqc -f -i "R${i}_seq" -n "R${i}" -o "${path}steps/${step_name}/multiqc/R_seq" "${path}steps/${step_name}/fastqc/R_seq/R${i}"*
done
## Remaining (non-W/R) sample name prefixes.
for i in 2972 A B C D CTRL T Undetermined
do
    multiqc -f -i "${i}_seq" -n "${i}" -o "${path}steps/${step_name}/multiqc/Other_seq" "${path}steps/${step_name}/fastqc/Other_seq/${i}"*
done
## Clean up temp dirs and the intermediate per-file FastQC reports.
rm -rf "$W_seq" "$R_seq" "$Other_seq"
rm -r "${path}steps/${step_name}/fastqc"
echo "Quality control is done" > "${path}steps/${step_name}/QC_done.txt"
conda deactivate
......@@ -14,7 +14,7 @@ step_name=$2
## START ENVIRONMENT TO EXECUTE MULTIQC
source ~/envs/coev/bin/activate
source activate coevolution
## Some temporary files are created nearby the source files
## Using links from a tmp will avoid errors in case the directory
......@@ -51,7 +51,7 @@ done
for i in $(ls ${path}data/sequences/ | grep -v ^W | grep -v ^R);
do
#echo $i;
ln -s ${path}raw_data/sequences/$i $Other_seq
ln -s ${path}data/sequences/$i $Other_seq
done
......@@ -95,4 +95,8 @@ done
rm -rf $W_seq $R_seq $Other_seq
deactivate
rm -r ${path}steps/$step_name/fastqc
echo Quality control is done > ${path}steps/$step_name/QC_done.txt
conda deactivate
......@@ -49,6 +49,9 @@ touch ${trimm_summary}_W ${trimm_summary}_R ${trimm_summary}_Other
# done
## ACTIVATING ENVIRONMENT
source activate coevolution
## DO TRIMMING FOR W_SEQUENCES.
......@@ -58,7 +61,7 @@ do
shortname=$(basename -s _L001_R1_001.fastq.gz $i);
echo $shortname >> ${trimm_summary}_W;
echo "Working on file " $shortname;
java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \
trimmomatic \
PE \
-threads $n_threads \
-phred33 \
......@@ -66,9 +69,9 @@ do
-quiet \
$i \
${i/_R1_/_R2_} \
${path}steps/$step_name/${shortname}_R1.fq.gz \
${path}steps/$step_name/W_seq/${shortname}_R1.fq.gz \
${path}steps/$step_name/uniq/U1.fq.gz \
${path}steps/$step_name/${shortname}_R2.fq.gz \
${path}steps/$step_name/W_seq/${shortname}_R2.fq.gz \
${path}steps/$step_name/uniq/U2.fq.gz \
CROP:145 \
HEADCROP:11 \
......@@ -85,7 +88,7 @@ do
shortname=$(basename -s _L001_R1_001.fastq.gz $i);
echo $shortname >> ${trimm_summary}_R;
echo "Working on file " $shortname;
java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \
trimmomatic \
PE \
-threads $n_threads \
-phred33 \
......@@ -113,7 +116,7 @@ do
shortname=$(basename -s _L001_R1_001.fastq.gz $i);
echo $shortname >> ${trimm_summary}_Other;
echo "Working on file " $shortname;
java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \
trimmomatic \
PE \
-threads $n_threads \
-phred33 \
......@@ -135,3 +138,7 @@ do
done
rm -rf ${path}steps/$step_name/uniq/
echo trimming is done > ${path}steps/$step_name/trimming_done.txt
conda deactivate
......@@ -26,6 +26,8 @@ ref=$4
# bowtie2 --phred33 -5 12 -p 35 -t -x ${path}data/refs/indexes_Sv/Sv -1 ${path}data/trimmed/W_seq/W4T3_S54_R1.fq.gz -2 ${path}data/trimmed/W_seq/W4T3_S54_R2.fq.gz -S ${path}results/test.sam
## ACTIVATING ENVIRONMENT
source activate coevolution
## CHECK FOR INDEXES
......@@ -38,18 +40,16 @@ if [ $? -eq 1 ];
fi
path_fasta=${path}steps/${prev_step}
mkdir = ${path}steps/${step_name}
path_results=${path}steps/${step_name}/
# bacteria_index=${path}data/refs/indexes_St/St
virus_index=${ref/.fasta}
for i in $(find $path_fasta -name *_R1.fq.gz)
do
## Declare local variables
......@@ -59,16 +59,28 @@ do
outdir=${var/data\/trimmed/results/mapping}/
## Give some feedback to the user
echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam
echo -e "\n"phage $root_name -\> ${path_results}${root_name}.sam
echo $i ${i/_R1/_R2}
echo $virus_index
## Mapping and indexing bam file
echo "#### MAPPING"
bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam
bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${path_results}${root_name}.sam
echo "#### SORTING"
samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam
samtools index -b ${outdir}${root_name}.sort.bam
samtools sort -O BAM -o ${path_results}${root_name}.sort.bam ${path_results}${root_name}.sam
echo "#### IDEXING"
samtools index -b ${path_results}${root_name}.sort.bam
done
mkdir -p ${path_results}W_seq
mkdir -p ${path_results}R_seq
mkdir -p ${path_results}Other_seq
mv ${path_results}W* ${path_results}W_seq
mv ${path_results}R* ${path_results}R_seq
mv ${path_results}*.* ${path_results}Other_seq
echo Mapping is done > ${path_results}mapping_done.txt
#!/bin/bash
## ACTIVATE ENVIRONMENT
source activate coevolution
## SNPCALLING
path=$1 ## Working directory's path
......@@ -8,8 +11,8 @@ prev_step=$3 ## name of step which created the bam files to
ref=$4 ## path to reference.fasta
echo ${path}steps/
mkdir -p ${path}steps/
mkdir -p ${path}steps/
for i in $(find ${path}steps/${prev_step}/ -type f -name *.bam)
do
......@@ -22,3 +25,16 @@ do
echo $(basename -s .sort.bam $i).vcf 'has been created';
done
conda deactivate
path_results=${path}steps/${step_name}/
mkdir -p ${path_results}W_seq
mkdir -p ${path_results}R_seq
mkdir -p ${path_results}Other_seq
mv ${path_results}W* ${path_results}W_seq
mv ${path_results}R* ${path_results}R_seq
mv ${path_results}*.* ${path_results}Other_seq
echo snp call is done > ${path}steps/${step_name}/snp_call_done.txt
#!/bin/bash
## SNPCALLING GATK
## Runs GATK HaplotypeCaller on every sorted BAM produced by the previous
## step, exports a per-sample CSV with VariantsToTable, then sorts results
## into W_seq / R_seq / Other_seq sub-directories.
## NOTE: original file started with a stray '$' before the shebang, which
## broke direct execution; fixed here.
path=${1:?working directory path (with trailing slash) required}   ## Working directory's path
step_name=${2:?step name required}
prev_step=${3:?previous step name required}                        ## step which created the bam files
ref=${4:?path to reference.fasta required}                         ## path to reference.fasta
echo "${path}steps/"
mkdir -p "${path}steps/"
source activate coevolution
## Create fasta dict for GATK to work
gatk CreateSequenceDictionary -R "$ref"
## Quote the -name pattern so the shell does not glob-expand it in the cwd.
for i in $(find "${path}steps/${prev_step}/" -type f -name '*.bam')
do
    ## Mirror the input directory layout under the new step name.
    od=$(dirname "$i")
    od=${od/$prev_step/$step_name}
    mkdir -p "$od"
    sample=$(basename -s .sort.bam "$i")
    gatk --java-options "-Xmx4g" HaplotypeCaller -R "$ref" -I "$i" -O "${od}/${sample}.vcf"
    echo "${sample}.vcf has been created"
    gatk VariantsToTable --variant "${od}/${sample}.vcf" -F POS -F TYPE -GF AD -O "${od}/${sample}.csv"
done
conda deactivate
path_results=${path}steps/${step_name}/
mkdir -p "${path_results}W_seq" "${path_results}R_seq" "${path_results}Other_seq"
## W* / R* also match the W_seq / R_seq dirs themselves; mv refuses to move
## a directory into itself and moves the remaining matches (as before).
mv "${path_results}"W* "${path_results}W_seq"
mv "${path_results}"R* "${path_results}R_seq"
mv "${path_results}"*.* "${path_results}Other_seq"
echo "snp call is done" > "${path}steps/${step_name}/snp_call_done.txt"
#!/bin/bash
## SNPCALLING VarDict
## Runs vardict-java on every sorted BAM from the previous step (using the
## matching .bed region file next to each BAM), converts the calls to VCF
## via teststrandbias.R / var2vcf_valid.pl, then sorts results into
## W_seq / R_seq / Other_seq sub-directories.
path=${1:?working directory path (with trailing slash) required}   ## Working directory's path
step_name=${2:?step name required}
prev_step=${3:?previous step name required}                        ## step which created the bam files
ref=${4:?path to reference.fasta required}                         ## path to reference.fasta
echo "${path}steps/"
mkdir -p "${path}steps/"
source activate vardict
## Quote the -name pattern so the shell does not glob-expand it in the cwd.
for i in $(find "${path}steps/${prev_step}/" -type f -name '*.bam')
do
    ## Mirror the input directory layout under the new step name.
    od=$(dirname "$i")
    od=${od/$prev_step/$step_name}
    mkdir -p "$od"
    sample=$(basename -s .sort.bam "$i")
    ## -f 0.01: report variants down to 1% allele frequency.
    ## -c/-S/-E/-g: column indexes of chrom/start/end/gene in the BED file.
    vardict-java -G "$ref" \
    -th 16 \
    -f 0.01 \
    -N "$sample" \
    -b "$i" \
    -c 1 -S 2 -E 3 -g 4 \
    "${i%".sort.bam"}.bed" \
    | teststrandbias.R | \
    var2vcf_valid.pl \
    -N "$sample" \
    -f 0.01 > "${od}/${sample}.vcf"
    echo "${sample}.vcf has been created"
done
conda deactivate
path_results=${path}steps/${step_name}/
mkdir -p "${path_results}W_seq" "${path_results}R_seq" "${path_results}Other_seq"
mv "${path_results}"W* "${path_results}W_seq"
mv "${path_results}"R* "${path_results}R_seq"
mv "${path_results}"*.* "${path_results}Other_seq"
echo "snp call is done" > "${path}steps/${step_name}/snp_call_done.txt"
#! /home/enrique/envs/biopython/bin/python
"""Prepare output directories for the VCF-parser frequency plots."""
import os
from vcf_parser3 import *
## Making folders for resulting plots.
# The original nested the plots/R_seq mkdir inside the plots/W_seq
# existence check, so R_seq was never created when W_seq already
# existed. makedirs(exist_ok=True) creates each path independently
# (including the parent "plots" directory) and is idempotent.
os.makedirs("plots/W_seq", exist_ok=True)
os.makedirs("plots/R_seq", exist_ok=True)
#########################################################
### TEST À METTRE DANS LA LIBRAIRIE
......@@ -54,9 +58,9 @@ def run_on_multiple_files(path_to_vcfs, outpath, list_files, list_headers, group
####
## LOAD CONTROL
####
ctrl_file = '/mnt/alpha_raid/work/Gandon/coevolution/phages/results/snpcalling/Other_seq/TO-WT_S83.vcf'
ctrl_file = 'steps/snpcall_freebayes/Other_seq/TO-WT_S83.vcf'
control = ImportControl2()
control = ImportVCF2()
control.load_vcf(ctrl_file)
......@@ -68,9 +72,9 @@ control.load_vcf(ctrl_file)
## DECLARE PATHS for W sequences
####
path_to_vcfs = '/mnt/alpha_raid/work/Gandon/coevolution/phages/results/snpcalling/W_seq/'
path_to_vcfs = 'steps/snpcall_freebayes/W_seq/'
outpath = '/home/enrique/work/Gandon/coevolution/phages/plots/freq_heatmap/W_seq/'
outpath = 'plots/W_seq/'
####
......@@ -156,9 +160,9 @@ foo = run_on_multiple_files(path_to_vcfs, outpath, w8, w8_headers, "W8", control
## DECLARE PATHS for R sequences
####
path_to_vcfs = '/mnt/alpha_raid/work/Gandon/coevolution/phages/results/snpcalling/R_seq/'
path_to_vcfs = 'steps/snpcall_freebayes/R_seq/'
outpath = '/home/enrique/work/Gandon/coevolution/phages/plots/freq_heatmap/R_seq/'
outpath = 'plots/R_seq/'
####
......@@ -219,3 +223,8 @@ foo = run_on_multiple_files(path_to_vcfs, outpath, r8, r8_headers, "R8", control
# run_on_multiple_files(path_to_vcfs, outpath, r6, r6_headers, "R6", control.ctrl_index)
# run_on_multiple_files(path_to_vcfs, outpath, r7, r7_headers, "R7", control.ctrl_index)
# run_on_multiple_files(path_to_vcfs, outpath, r8, r8_headers, "R8", control.ctrl_index)
# Write the completion marker for downstream tooling (assumes ./plots
# exists at this point). A context manager guarantees the handle is
# closed even on error; the original also shadowed the builtin `file`.
with open("plots/vcf_parser_done.txt", "w") as done_file:
    done_file.write("vcf parser is done")
from vcf_parser3 import *
def run_on_multiple_files(path_to_vcfs, outpath, list_files, list_headers, group_name, control_list, file_root):
    """Load one population's VCF files and render its frequency-matrix plot.

    Parameters:
        path_to_vcfs: directory containing the input VCF files.
        outpath: directory where the plot image is written.
        list_files: VCF file names (one per timepoint) for this population.
        list_headers: column labels matching list_files (e.g. T1..T4).
        group_name: population label used in the plot title/filename.
        control_list: control-sample index used as baseline.
        file_root: root name passed through to FrequencyOneSNP.

    Returns the FrequencyOneSNP plot object.
    """
    collected_indexes = []
    # NOTE(review): the last loaded ImportVCF2 instance is also handed to
    # FrequencyOneSNP below, matching the original behaviour.
    for vcf_name in list_files:
        vcf_path = path_to_vcfs + vcf_name
        print(vcf_path)
        temp = ImportVCF2()
        temp.load_vcf(vcf_path)
        collected_indexes.append(temp.index)
    the_plot = FrequencyOneSNP(collected_indexes, list_headers, control_list, temp, 'snp', outpath, file_root)
    the_plot.plot_frequency_matrix(outpath + 'freq_plot_' + group_name + '.png', group_name)
    return the_plot
#ctrl_file = '/mnt/alpha_raid/work/Gandon/coevolution/phages/results/snpcalling/Other_seq/TO-WT_S83.vcf'
# Control (wild-type T0) sample used as the baseline for the heatmaps.
ctrl_file='/mnt/alpha_raid/work/Gandon/coevolution/phages/steps/snpcalling/Other_seq/TO-WT_S83.vcf'
# NOTE(review): other versions of this pipeline use ImportVCF2() for the
# control -- confirm ImportControl2 still exists in vcf_parser3.
control = ImportControl2()
control.load_vcf(ctrl_file)
# Input VCF directory and plot output directory for the W populations.
path_to_vcfs = '/mnt/alpha_raid/work/Gandon/coevolution/phages/steps/snpcalling/W_seq/'
outpath = '/mnt/alpha_raid/work/Gandon/coevolution/phages/plots/freq_heatmap/W_seq/'
# W1 population across the four timepoints T1..T4.
w1 =['W1T1_S18.vcf', 'W1T2_S35.vcf', 'W1T3_S51.vcf', 'W1T4_S67.vcf']
std_headers = ['T1', 'T2', 'T3', 'T4']
w1_headers = std_headers
# NOTE(review): elsewhere the control argument is control.ctrl_index;
# here control.index is passed -- verify which attribute is correct.
foo = run_on_multiple_files(path_to_vcfs, outpath, w1, w1_headers, "W1", control.index, 'w1_table')
......@@ -8,19 +8,30 @@ A more detailed description of the files is below the list (use ctrl + f)
Files:
* 00_create_py_env.sh
* prepping.sh*
* 01_quality_check.sh*
* 01_2_quality_check.sh*
* 02_trimm_and_clean.sh*
* 03_mapping.sh*
* 04_snpcalling.sh*
* 04a_FreeBayes.sh*
* 04b_GATK.sh
* 04c_VarDict.sh
* 05b_convert_protospacer_dico2fasta.py*
* 06b_blast_protospaces.sh*
* 07_2_run_vcf_parser_all_files.py
* 07_2_test.py
* 07_run_vcf_parser_all_files.py*
* vcf_to_csv.py
* update_ref_genome.py
* update_ref_geome_Wlog.py
* procedure.sh
* conda_procedure.sh
* README.md
* requirements_py-env.txt
* coevolution_env.yml
* vardict_env.yml
* vcf_parser3.py
......@@ -104,13 +115,24 @@ samtools index -b ${outdir}${root_name}.sort.bam
Creates a python virtual environment using `virtualenv` and the default python3 version of the system, and will store the environment in `~/envs/coev`. The installation of packages is done through pip.
### prepping.sh*
Prepares the data into subgroups.
Make conda environments based on .yml files.
Make links to data to make sub-groups
Outputs a text file, "prepping.txt", to let the user know when it is done.
prepping.sh is only executed when running through the conda_procedure.sh file and not the procedure.sh file.
### 01_quality_check.sh*
It will use FastQC to create quality control reports and then use MultiQC to assemble the reports into a single file. To make things easier, the input files are separated into 3 groups: R, W and Other. These groups come from different treatments.
This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke**
### 01_2_quality_check.sh*
As 01_quality_check.sh, but is executed on the trimmed data. This is to evaluate the trimm and clean step below.
### 02_trimm_and_clean.sh*
......@@ -129,7 +151,7 @@ The mapper is bowtie2, after mapping the sam is sorted and converted to a bam an
This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke**
### 04_snpcalling.sh*
### 04a_FreeBayes.sh*
It uses freebayes to make the snp calling
......@@ -138,6 +160,13 @@ This script takes three arguments:
2 Path to the reference
3 Path to the output directory
### 04b_GATK.sh
It uses GATK to make the snp calling - not a part of current procedure!
### 04c_VarDict.sh
It uses VarDict to make the snp calling - not a part of current procedure!
### 05b_convert_protospacer_dico2fasta.py*
......@@ -181,80 +210,42 @@ To run from ipython:
%run 07_2_run_vcf_parser_all_files.py
```
----
### vcf_to_csv.py
Takes the vcf files from snp calling as input and outputs a csv file with information for further analysis. One csv file is generated for each population across timepoints. Thereby, 8 W and 8 R csv's are made.
The columns in the csv holds the following information:
## The other scripts
### procedure.sh
The commands used to launch the scripts above are written here, as well as the supplementary commands used to separate the different data, extract information and perform all other actions.
It contains a pre-treatment of data to create sub-groups using symbolic links.
-- POS: The genomic position of the variant
-- TIME: T1, T2, T3 or T4
-- ALT: Alternative allele
-- REF: Reference allele
-- AO: Alternate allele observation count
-- DP: Read depth
-- TYPE: "snp", "mnp", ins", "del" or "complex"
-- FREQ: The allele frequency, filter >= 0.025
----
### update_ref_genome.py
Allows to create a consensus fasta file from the reference fasta
and a VCF file from which the variants will be integrated.
It filters the variants with a minimum frequency of 0.45.
It is equivalent to bcftools consensus:
`bcftools consensus --sample unknown -f NC_007019.1.fasta TO-WT_S83.vcf.gz -o test.fasta`
### get_PAM.py -- Getting new PAMs
From the genome of Streptococcus virus 2972 we'll be looking for new PAMs (Protospacer Associated Motif).
Takes a vcf made from snp calling, containing variants called in TO-WT. This vcf together with the original reference fasta sequence, is used to make an updated reference genome to be used for further snp calling.
The original scripts were made to teach Antoine Nicot how to program.
### update_ref_genome_Wlog.py
The patterns searched for are: **AGAA** and **GGAA**, as well as the reverse complementary sequences **TTCT** and **TTCC**.
Same as above but outputs a log.txt in same location as fasta sequence. This log.txt contains information about the variants included in the updated reference.
The PAMs are short sequences of 4 nucleotides located on the 3' side of a protospacer sequence. The length of a protospacer taken into account is 32 + 4 (protospacer + PAM).