diff --git a/phages/scripts/00_create_py_env.sh b/phages/scripts/00_create_py_env.sh old mode 100644 new mode 100755 index 8612ac94ebcff8f271c99e3dd09c3ffd85c63333..c5743f94046aeba27f2837db696d1305d77ec705 --- a/phages/scripts/00_create_py_env.sh +++ b/phages/scripts/00_create_py_env.sh @@ -27,7 +27,7 @@ source ${py_env_dir}/bin/activate pip install --upgrade pip ## -pip install -r requirements_py-env.txt +pip install -r scripts/requirements_py-env.txt # pip install biopython pandas matplotlib multiqc pyvcf diff --git a/phages/scripts/01_quality_check.sh b/phages/scripts/01_quality_check.sh index 74ba96742517c97c36aed20f43dbf48e4093c184..cf1fbf67b265e08118bcc02e62b6cd0e74e818ef 100755 --- a/phages/scripts/01_quality_check.sh +++ b/phages/scripts/01_quality_check.sh @@ -1,9 +1,14 @@ #!/bin/bash +## Launch example from coevolution/phages/ +## ./scripts/01_quality_check.sh $PWD/ qc_rawd_data + + ## PATH TO WORKING DIRECTORY path=$1 #path=/home/enrique/work/Gandon/coevolution/phages/ +step_name=$2 @@ -11,6 +16,9 @@ path=$1 source ~/envs/coev/bin/activate +## Some temporary files are created nearby the source files +## Using links from a tmp will avoid errors in case the directory +## containing the raw data is not writable ## SEPARATE THE FILES BY NAME @@ -20,47 +28,47 @@ W_seq=$(mktemp -d) R_seq=$(mktemp -d) Other_seq=$(mktemp -d) - +echo $W_seq $R_seq $Other_seq ### CREATE SYMBOLIC LINKS TO SEQUENCE FILES ### DIIVIDED IN 3 DIRECTORIES - echo "CREATE SYMBOLIC LINKS" -for i in $(ls ${path}raw_data/sequences/W*); +for i in $(ls ${path}data/sequences/W*); do ln -s $i $W_seq; done -for i in $(ls ${path}raw_data/sequences/R*); +for i in $(ls ${path}data/sequences/R*); do ln -s $i $R_seq; done -for i in $(ls ${path}raw_data/sequences/ | grep -v ^W | grep -v ^R); +for i in $(ls ${path}data/sequences/ | grep -v ^W | grep -v ^R); do #echo $i; ln -s ${path}raw_data/sequences/$i $Other_seq done +echo "CREATE SYMBOLIC LINKS -- DONE" ### MAKE RUN FASTQC ON EACH GROUPE -mkdir -p ${path}qual/fastqc/{W_seq,R_seq,Other_seq} +mkdir -p ${path}steps/${step_name}/fastqc/{W_seq,R_seq,Other_seq} -fastqc -t 35 --noextract -o ${path}qual/fastqc/W_seq $W_seq/* +fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/W_seq $W_seq/* # multiqc -f -i W_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/W_seq -fastqc -t 35 --noextract -o ${path}qual/fastqc/R_seq $R_seq/* +fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/R_seq $R_seq/* # multiqc -f -i R_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/R_seq/ -fastqc -t 35 --noextract -o ${path}qual/fastqc/Other_seq $Other_seq/* +fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/Other_seq $Other_seq/* # multiqc -f -i Other_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/Other_seq @@ -71,17 +79,17 @@ fastqc -t 35 --noextract -o ${path}qual/fastqc/Other_seq $Other_seq/* ## LOOP MULTIQC DEPENDING ON THE INPUTS ## MAKE DIRECTORIES FOR THE DIFFERENT SAMPLES: -mkdir -p ${path}qual/multiqc/{W,R,Other} +mkdir -p ${path}steps/$step_name/multiqc/{W_seq,R_seq,Other_seq} for i in $(seq 8) do - multiqc -f -i W${i}_seq -n W${i} -o ${path}qual/multiqc/W ${path}qual/fastqc/W_seq/W${i}* - multiqc -f -i R${i}_seq -n R${i} -o ${path}qual/multiqc/R ${path}qual/fastqc/R_seq/R${i}* + multiqc -f -i W${i}_seq -n W${i} -o ${path}steps/$step_name/multiqc/W_seq ${path}steps/$step_name/fastqc/W_seq/W${i}* + multiqc -f -i R${i}_seq -n R${i} -o ${path}steps/$step_name/multiqc/R_seq ${path}steps/$step_name/fastqc/R_seq/R${i}* done for i in 2972 A B C D CTRL T Undetermined do - multiqc -f -i ${i}_seq -n ${i} -o ${path}qual/multiqc/Other ${path}qual/fastqc/Other_seq/${i}* + multiqc -f -i ${i}_seq -n ${i} -o ${path}steps/$step_name/multiqc/Other_seq ${path}steps/$step_name/fastqc/Other_seq/${i}* done diff --git a/phages/scripts/02_trimm_and_clean.sh b/phages/scripts/02_trimm_and_clean.sh index 9bfb71da40774f491833259feec36cd0b4992a3e..d41f47034e216764791dc6fd0779048406d79e0c 100755 --- a/phages/scripts/02_trimm_and_clean.sh +++ b/phages/scripts/02_trimm_and_clean.sh @@ -1,9 +1,14 @@ #! /bin/bash +## VARIABLES -path=/home/enrique/work/Gandon/coevolution/phages/ +#path=/home/enrique/work/Gandon/coevolution/phages/ +path=$1 +n_threads=35 +## SCRIPT + mkdir -p data/trimmed/{W_seq,R_seq,Other_seq} trimm_summary=${path}data/summary_trimm @@ -45,7 +50,7 @@ do echo "Working on file " $shortname; java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \ PE \ - -threads 35 \ + -threads $n_threads \ -phred33 \ -summary /tmp/tmp.trimm_summary \ -quiet \ @@ -72,7 +77,7 @@ do echo "Working on file " $shortname; java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \ PE \ - -threads 35 \ + -threads $n_threads \ -phred33 \ -summary /tmp/tmp.trimm_summary \ -quiet \ @@ -100,7 +105,7 @@ do echo "Working on file " $shortname; java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \ PE \ - -threads 35 \ + -threads $n_threads \ -phred33 \ -summary /tmp/tmp.trimm_summary \ -quiet \ diff --git a/phages/scripts/03_mapping.sh b/phages/scripts/03_mapping.sh index 8b78f7155608e6dcb2c5738a678476f67b86d4cd..d32775e4ce11b7759d5476a578bfdcd4a06855e7 100755 --- a/phages/scripts/03_mapping.sh +++ b/phages/scripts/03_mapping.sh @@ -2,7 +2,8 @@ ## DEFINE PATH -path=/home/enrique/work/Gandon/coevolution/phages/ +path=$1 +# path=/home/enrique/work/Gandon/coevolution/phages/ ## CREATE INDEXES AND @@ -13,32 +14,34 @@ path=/home/enrique/work/Gandon/coevolution/phages/ # bowtie2 --phred33 -5 12 -p 35 -t -x ${path}data/refs/indexes_Sv/Sv -1 ${path}data/trimmed/W_seq/W4T3_S54_R1.fq.gz -2 ${path}data/trimmed/W_seq/W4T3_S54_R2.fq.gz -S ${path}results/test.sam -path_fasta=/home/enrique/work/Gandon/coevolution/phages/data/trimmed/ +path_fasta=${path}data/trimmed/ -path_results=/home/enrique/work/Gandon/coevolution/phages/results/ +path_results=${path}results/ -bacteria_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_St/St +bacteria_index=${path}data/refs/indexes_St/St -virus_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_Sv/Sv +virus_index=${path}data/refs/indexes_Sv/Sv - -for i in $(find $path_fasta -name *_R1.fq.gz) +for i in $(find $path_fasta -name *_R1.fq.gz) do + ## Declare local variables # echo $i - root_name=$(basename -s _R1.fq.gz $i) + root_name=$(basename -s _R1.fq.gz $i) var=$(dirname $i) outdir=${var/data\/trimmed/results/mapping}/ - echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam - echo $i ${i/_R1/_R2} + ## Give some feedback to the user + echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam + echo $i ${i/_R1/_R2} echo $virus_index + ## Mapping and indexing bam file echo "#### MAPPING" - bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam + bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam echo "#### SORTING" - samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam - samtools index -b ${outdir}${root_name}.sort.bam + samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam + samtools index -b ${outdir}${root_name}.sort.bam done diff --git a/phages/scripts/04_snpcalling.sh b/phages/scripts/04_snpcalling.sh index fe9a12193666c6323d580568effc4e315c9a4704..470e1d9a0fa2831d4be035ce422d77647ed69fd4 100755 --- a/phages/scripts/04_snpcalling.sh +++ b/phages/scripts/04_snpcalling.sh @@ -3,7 +3,7 @@ ## SNPCALLING wd=$1 ## Working directory's path -ref=$2 ## path to rference.fasta +ref=$2 ## path to reference.fasta od=$3 ## Output directory diff --git a/phages/scripts/06b_blast_protospaces.sh b/phages/scripts/06b_blast_protospaces.sh old mode 100755 new mode 100644 diff --git a/phages/scripts/README.md b/phages/scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..39441b2ce124f21695b0cfac23db399dc954c92f --- /dev/null +++ b/phages/scripts/README.md @@ -0,0 +1,237 @@ +## Contents of scripts folder + +The scripts are numbered in the order they are run. +If in doubt use the scripts which are executable. + +A more detailed description of the files is below the list (use ctrl + f) + +Files: + +* 00_create_py_env.sh +* 01_quality_check.sh* +* 02_trimm_and_clean.sh* +* 03_mapping.sh* +* 04_snpcalling.sh* +* 05b_convert_protospacer_dico2fasta.py* +* 06b_blast_protospaces.sh* +* 07_2_run_vcf_parser_all_files.py +* 07_2_test.py +* 07_run_vcf_parser_all_files.py* + +* procedure.sh +* README.md +* requirements_py-env.txt + +* vcf_parser3.py + +Folders: + +* debug +* lib +* \_\_pycache\_\_ + +---- + +## Coding practices + +In python I tried to use as much as possible the Python Enhancement Proposal 8 (PEP-8). https://www.python.org/dev/peps/pep-0008/ + +A difference I use regularly is using double `##` at the begining of a line containing *informative comments*. +During the developement stages I comment some code lines that would be uncommented as a block. Having two '#' prevents comments to be executed as code. + +Example: + +```python +## This block of code calculate the proportion +for i in input_list: + # print("proportion of the list") + print(i / sum(list)) +``` + +Concerning **bash** coding I use often double spaces to separate commands, parameters, and arguments. When using some long names it makes things more readable + +Big chunks of code are commented with capitals and short phrases, +whereas longer phrases in comments are in lower case. + +Example: + +```bash +## BIG CODE CHUNK + +for i in $(find $path_fasta -name *_R1.fq.gz) +do + ## Declare local variables + # echo $i + root_name=$(basename -s _R1.fq.gz $i) + var=$(dirname $i) + outdir=${var/data\/trimmed/results/mapping}/ + + ## Give some feedback to the user + echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam + echo $i ${i/_R1/_R2} + echo $virus_index + + ## Mapping and indexing bam file + echo "#### MAPPING" + bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam + + echo "#### SORTING" + samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam + samtools index -b ${outdir}${root_name}.sort.bam + +done +``` + +Sometimes I'll put the arguments in different lines having an indentation: + +```bash +## A command with multiple parameters seaprated per line +samtools sort \ + -O BAM \ + -o ${outdir}${root_name}.sort.bam \ + ${outdir}${root_name}.sam + +## Short commands in one single line +samtools index -b ${outdir}${root_name}.sort.bam +``` + +---- + +## File descriptions + + +### 00_create_py_env.sh + +Creates a python virtual environment using `virtualenv`, the default python3 version of the system and will storte the environment in `~/envs/coev`. The installation of packages is done through pip. + + +### 01_quality_check.sh* + +It will use FastQC to create quality control reports and then use multiqc to assemble the reports in only file. To make things easier, the input files are separated in 3 groups R, W and Other. These groups come from different treatments. + +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke** + + +### 02_trimm_and_clean.sh* + +Launches Trimmomatic to clean data. +The parameters are embeded in the code -- for now + +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke** + + +### 03_mapping.sh* + +The index creation is commented in the top of the script. It's only required once. +The path to the input files is a full path using a variable. +The mapper is bowtie2, after mapping the sam is sorted and converted to a bam and indexed so it's ready for the next stage. + +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke** + + +### 04_snpcalling.sh* + +It uses freebayes to make the snp calling + +This script takes three arguments: +1 The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages`. **Don't forget that final stroke** +2 Path to the reference +3 Path to the output directory + + +### 05b_convert_protospacer_dico2fasta.py* + +The *protospacer_dico* is a raw text file with only one sequence per line. +Each line is a protospacer sequence that has been manually selected and curated. +This script converts these sequences to fasta format. +The name given to each sequence is the line number. + +No arguments. the path to the file is hard written -- To be corrected + + +### 06b_blast_protospaces.sh* + +Protospacers come from bacteria, we need to know where do they map on the virus' genome. + +We tried out different parameters, this script is more a reminder of the commands and parameters tested. -- To be corrected + + +## 07_2_run_vcf_parser_all_files.py + +This is the current script being used, the previous are old versions, before starting a git. + +* 07_2_test.py : This is for debuging. The inputs and outputs are in the folder `scripts/debug` +* 07_run_vcf_parser_all_files.py* + +This is the analysis script. +It describes the experimental planes to comapare data and make the graphics that were requested: binary and frequency heatmaps. +It requires the file `vcf_parser3.py` to be located in the same directory. + +The analysis is defined in a function in the top of the script. + +To run it I use ipython: + +```bash +source ~/envs/coev/bin/activate +pip install ipython +``` + +To run from ipython: +``` +%run 07_2_run_vcf_parser_all_files.py +``` + +---- + + + +## The other scripts + +### procedure.sh + +The commands used to launch the scripts up here as well as the supplementary commands to separate the different data, extract and all other action is written here. + +It contains a pre-treatment of data to create sub-groups using symbolic links. + + +### README.md + +This file :-P + + +### requirements_py-env.txt + +File used by the script `00_create_py_env.sh` +It contains the list of libraries and versions used in python for this analysis. + + +### vcf_parser3.py + +This is a python library which is imported by `07_2_run_vcf_parser_all_files.py` + +It contains allows to import data from vcf files and put it in a pandas dataframe format to ease the analyses. + +It works with objects: The _attributes_ contain information and the _methods_ are internal functions for that object. For example "remove control SNPs" + +The last class allows to create a heatmap with matplotlib. + +A more extended documentation will be made in the future + + +---- + +## Folders + +### debug + +Folder to put inputs and outputs for debuging. + +### lib + +Folder containing the library of some tests, +toy examples and commands to be called from the other scripts + +### * \_\_pycache\_\_ + +Dispensable. +Created by python automatically when importing a `*.py` diff --git a/phages/scripts/procedure.sh b/phages/scripts/procedure.sh new file mode 100644 index 0000000000000000000000000000000000000000..153f01580c84be500dd9bc73cacd0cac0daed676 --- /dev/null +++ b/phages/scripts/procedure.sh @@ -0,0 +1,53 @@ +#! /bin/bash + + +## SET THE PATH TO THE PHAGES DIRECTORY +# cd /home/user/work/coev/phages + + +## PREPARE DATA SUBGROUPS + +## Uncompress raw data into data folder +tar -xzvf data/sequences.tar.gz -C data + +## Change premissions to avoid accidents +chmod -w data/sequences/* +chmod -w data/sequences.tar.gz + +## Make links to data to make sub-groups +## It makes it easier to handle groups of files + + +## This step is also done on scripts/01_quality_check.sh +mkdir -p data/fastq_ln/{R_seq,W_seq,Other_seq} + +for i in $PWD/data/sequences/R* +do + ln -s $i $PWD/data/fastq_ln/R_seq +done + + +for i in $PWD/data/sequences/W* +do + ln -s $i $PWD/data/fastq_ln/W_seq +done + + +for i in $(ls raw_data/sequences/ | grep -v -E "^W.*.fastq.gz|^R.*.fastq.gz") +do + ln -s $PWD/raw_data/sequences/$i $PWD/data/fastq_ln/Other_seq +done + + + +################################### +## CREATE PYTHON ENVIRONMENT -- With virtualenv + +./scripts/00_create_py_env.sh +## environment located in: +## ~/envs/coev/ + +################################### +## QUALITY CHECK + +./scripts/01_quality_check.sh $PWD/ qc_raw_data