Commit 9f181196 authored by eortega

Merge branch 'Check_run_post_restructure' into 'master'

Check run post restructure

See merge request !3
parents e8252b78 a0e243ef
@@ -27,7 +27,7 @@ source ${py_env_dir}/bin/activate
pip install --upgrade pip
##
pip install -r requirements_py-env.txt
pip install -r scripts/requirements_py-env.txt
# pip install biopython pandas matplotlib multiqc pyvcf
......
#!/bin/bash
## Launch example from coevolution/phages/
## ./scripts/01_quality_check.sh $PWD/ qc_raw_data
## PATH TO WORKING DIRECTORY
path=$1
#path=/home/enrique/work/Gandon/coevolution/phages/
step_name=$2
@@ -11,6 +16,9 @@ path=$1
source ~/envs/coev/bin/activate
## Some temporary files are created nearby the source files
## Using links from a tmp will avoid errors in case the directory
## containing the raw data is not writable
## SEPARATE THE FILES BY NAME
@@ -20,47 +28,47 @@ W_seq=$(mktemp -d)
R_seq=$(mktemp -d)
Other_seq=$(mktemp -d)
echo $W_seq $R_seq $Other_seq
### CREATE SYMBOLIC LINKS TO SEQUENCE FILES
### DIVIDED IN 3 DIRECTORIES
echo "CREATE SYMBOLIC LINKS"
for i in $(ls ${path}raw_data/sequences/W*);
for i in $(ls ${path}data/sequences/W*);
do
ln -s $i $W_seq;
done
for i in $(ls ${path}raw_data/sequences/R*);
for i in $(ls ${path}data/sequences/R*);
do
ln -s $i $R_seq;
done
for i in $(ls ${path}raw_data/sequences/ | grep -v ^W | grep -v ^R);
for i in $(ls ${path}data/sequences/ | grep -v ^W | grep -v ^R);
do
#echo $i;
ln -s ${path}raw_data/sequences/$i $Other_seq
done
echo "CREATE SYMBOLIC LINKS -- DONE"
### RUN FASTQC ON EACH GROUP
mkdir -p ${path}qual/fastqc/{W_seq,R_seq,Other_seq}
mkdir -p ${path}steps/${step_name}/fastqc/{W_seq,R_seq,Other_seq}
fastqc -t 35 --noextract -o ${path}qual/fastqc/W_seq $W_seq/*
fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/W_seq $W_seq/*
# multiqc -f -i W_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/W_seq
fastqc -t 35 --noextract -o ${path}qual/fastqc/R_seq $R_seq/*
fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/R_seq $R_seq/*
# multiqc -f -i R_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/R_seq/
fastqc -t 35 --noextract -o ${path}qual/fastqc/Other_seq $Other_seq/*
fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/Other_seq $Other_seq/*
# multiqc -f -i Other_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/Other_seq
@@ -71,17 +79,17 @@ fastqc -t 35 --noextract -o ${path}qual/fastqc/Other_seq $Other_seq/*
## LOOP MULTIQC DEPENDING ON THE INPUTS
## MAKE DIRECTORIES FOR THE DIFFERENT SAMPLES:
mkdir -p ${path}qual/multiqc/{W,R,Other}
mkdir -p ${path}steps/$step_name/multiqc/{W_seq,R_seq,Other_seq}
for i in $(seq 8)
do
multiqc -f -i W${i}_seq -n W${i} -o ${path}qual/multiqc/W ${path}qual/fastqc/W_seq/W${i}*
multiqc -f -i R${i}_seq -n R${i} -o ${path}qual/multiqc/R ${path}qual/fastqc/R_seq/R${i}*
multiqc -f -i W${i}_seq -n W${i} -o ${path}steps/$step_name/multiqc/W_seq ${path}steps/$step_name/fastqc/W_seq/W${i}*
multiqc -f -i R${i}_seq -n R${i} -o ${path}steps/$step_name/multiqc/R_seq ${path}steps/$step_name/fastqc/R_seq/R${i}*
done
for i in 2972 A B C D CTRL T Undetermined
do
multiqc -f -i ${i}_seq -n ${i} -o ${path}qual/multiqc/Other ${path}qual/fastqc/Other_seq/${i}*
multiqc -f -i ${i}_seq -n ${i} -o ${path}steps/$step_name/multiqc/Other_seq ${path}steps/$step_name/fastqc/Other_seq/${i}*
done
......
#! /bin/bash
## VARIABLES
path=/home/enrique/work/Gandon/coevolution/phages/
#path=/home/enrique/work/Gandon/coevolution/phages/
path=$1
n_threads=35
## SCRIPT
mkdir -p data/trimmed/{W_seq,R_seq,Other_seq}
trimm_summary=${path}data/summary_trimm
@@ -45,7 +50,7 @@ do
echo "Working on file " $shortname;
java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \
PE \
-threads 35 \
-threads $n_threads \
-phred33 \
-summary /tmp/tmp.trimm_summary \
-quiet \
@@ -72,7 +77,7 @@ do
echo "Working on file " $shortname;
java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \
PE \
-threads 35 \
-threads $n_threads \
-phred33 \
-summary /tmp/tmp.trimm_summary \
-quiet \
@@ -100,7 +105,7 @@ do
echo "Working on file " $shortname;
java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \
PE \
-threads 35 \
-threads $n_threads \
-phred33 \
-summary /tmp/tmp.trimm_summary \
-quiet \
......
@@ -2,7 +2,8 @@
## DEFINE PATH
path=/home/enrique/work/Gandon/coevolution/phages/
path=$1
# path=/home/enrique/work/Gandon/coevolution/phages/
## CREATE INDEXES AND
@@ -13,32 +14,34 @@ path=/home/enrique/work/Gandon/coevolution/phages/
# bowtie2 --phred33 -5 12 -p 35 -t -x ${path}data/refs/indexes_Sv/Sv -1 ${path}data/trimmed/W_seq/W4T3_S54_R1.fq.gz -2 ${path}data/trimmed/W_seq/W4T3_S54_R2.fq.gz -S ${path}results/test.sam
path_fasta=/home/enrique/work/Gandon/coevolution/phages/data/trimmed/
path_fasta=${path}data/trimmed/
path_results=/home/enrique/work/Gandon/coevolution/phages/results/
path_results=${path}results/
bacteria_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_St/St
bacteria_index=${path}data/refs/indexes_St/St
virus_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_Sv/Sv
virus_index=${path}data/refs/indexes_Sv/Sv
for i in $(find $path_fasta -name *_R1.fq.gz)
for i in $(find $path_fasta -name *_R1.fq.gz)
do
## Declare local variables
# echo $i
root_name=$(basename -s _R1.fq.gz $i)
root_name=$(basename -s _R1.fq.gz $i)
var=$(dirname $i)
outdir=${var/data\/trimmed/results/mapping}/
echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam
echo $i ${i/_R1/_R2}
## Give some feedback to the user
echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam
echo $i ${i/_R1/_R2}
echo $virus_index
## Mapping and indexing bam file
echo "#### MAPPING"
bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam
bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam
echo "#### SORTING"
samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam
samtools index -b ${outdir}${root_name}.sort.bam
samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam
samtools index -b ${outdir}${root_name}.sort.bam
done
@@ -3,7 +3,7 @@
## SNPCALLING
wd=$1 ## Working directory's path
ref=$2 ## path to rference.fasta
ref=$2 ## path to reference.fasta
od=$3 ## Output directory
......
File mode changed from 100755 to 100644
## Contents of scripts folder
The scripts are numbered in the order they are run.
If in doubt, use the scripts that are executable (those marked with a trailing `*` in the list below).
A more detailed description of each file is given below the list (use Ctrl+F to jump to it).
Files:
* 00_create_py_env.sh
* 01_quality_check.sh*
* 02_trimm_and_clean.sh*
* 03_mapping.sh*
* 04_snpcalling.sh*
* 05b_convert_protospacer_dico2fasta.py*
* 06b_blast_protospaces.sh*
* 07_2_run_vcf_parser_all_files.py
* 07_2_test.py
* 07_run_vcf_parser_all_files.py*
* procedure.sh
* README.md
* requirements_py-env.txt
* vcf_parser3.py
Folders:
* debug
* lib
* \_\_pycache\_\_
----
## Coding practices
In Python I tried to follow the Python Enhancement Proposal 8 (PEP-8) as much as possible: https://www.python.org/dev/peps/pep-0008/
One difference I use regularly is a double `##` at the beginning of lines containing *informative comments*.
During development I comment out some code lines so that they can be uncommented later as a block. Using two `#` for the informative comments prevents them from being uncommented and executed as code.
Example:
```python
## This block of code calculates the proportions
for i in input_list:
    # print("proportion of the list")
    print(i / sum(input_list))
```
Concerning **bash** code, I often use double spaces to separate commands, parameters, and arguments; with long names this makes things more readable.
Big chunks of code are annotated with short comments in capitals,
whereas longer comments are written in lower case.
Example:
```bash
## BIG CODE CHUNK
for i in $(find $path_fasta -name *_R1.fq.gz)
do
## Declare local variables
# echo $i
root_name=$(basename -s _R1.fq.gz $i)
var=$(dirname $i)
outdir=${var/data\/trimmed/results/mapping}/
## Give some feedback to the user
echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam
echo $i ${i/_R1/_R2}
echo $virus_index
## Mapping and indexing bam file
echo "#### MAPPING"
bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam
echo "#### SORTING"
samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam
samtools index -b ${outdir}${root_name}.sort.bam
done
```
Sometimes I put the arguments on separate, indented lines:
```bash
## A command with multiple parameters, one per line
samtools sort \
    -O BAM \
    -o ${outdir}${root_name}.sort.bam \
    ${outdir}${root_name}.sam
## Short commands in one single line
samtools index -b ${outdir}${root_name}.sort.bam
```
----
## File descriptions
### 00_create_py_env.sh
Creates a Python virtual environment using `virtualenv` and the system's default python3, and stores the environment in `~/envs/coev`. The packages are installed through pip.
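The script boils down to the following steps (a minimal sketch; the environment location and requirements file are the ones mentioned above, but check the script itself for the exact commands):
```bash
## Sketch of what 00_create_py_env.sh does: create and populate the environment
py_env_dir=~/envs/coev                            # where the environment is stored
virtualenv -p python3 ${py_env_dir}               # uses the system's default python3
source ${py_env_dir}/bin/activate
pip install --upgrade pip
pip install -r scripts/requirements_py-env.txt    # libraries listed for this project
```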
### 01_quality_check.sh*
It uses FastQC to create quality-control reports and then MultiQC to assemble the reports into a single file. To make things easier, the input files are separated into three groups, R, W and Other, which come from different treatments.
This script takes two arguments: the path to the working directory, which is the project directory, e.g. `/home/user/work/coevolution/phages/` (**don't forget the trailing slash**), and a step name used for the output directory under `steps/`.
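Example invocation, as used in `procedure.sh`:
```bash
## Run the quality check on the raw data; 'qc_raw_data' is the step name
./scripts/01_quality_check.sh $PWD/ qc_raw_data
```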
### 02_trimm_and_clean.sh*
Launches Trimmomatic to clean the data.
The parameters are embedded in the code -- for now.
This script takes one argument: the path to the working directory, which is the project directory, e.g. `/home/user/work/coevolution/phages/`. **Don't forget the trailing slash.**
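A usage sketch, assuming it is launched from the project directory like the other scripts:
```bash
## Trim and clean the raw reads; results go under data/trimmed/
./scripts/02_trimm_and_clean.sh $PWD/
```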
### 03_mapping.sh*
The index creation is commented out at the top of the script; it is only required once.
The path to the input files is a full path built from a variable.
The mapper is bowtie2; after mapping, the SAM is sorted, converted to a BAM and indexed so it is ready for the next stage.
This script takes one argument: the path to the working directory, which is the project directory, e.g. `/home/user/work/coevolution/phages/`. **Don't forget the trailing slash.**
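A usage sketch, again assuming it is launched from the project directory:
```bash
## Map the trimmed reads; SAM/BAM files go under results/mapping/
./scripts/03_mapping.sh $PWD/
```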
### 04_snpcalling.sh*
It uses freebayes to do the SNP calling.
This script takes three arguments (see the example below):
1. The path to the working directory, which is the project directory, e.g. `/home/user/work/coevolution/phages/`. **Don't forget the trailing slash.**
2. The path to the reference FASTA.
3. The path to the output directory.
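A usage sketch; the reference and output paths below are only placeholders:
```bash
## SNP calling with freebayes; the second and third arguments are hypothetical paths
./scripts/04_snpcalling.sh $PWD/ $PWD/data/refs/reference.fasta $PWD/results/snpcalling
```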
### 05b_convert_protospacer_dico2fasta.py*
The *protospacer_dico* is a raw text file with one sequence per line.
Each line is a protospacer sequence that has been manually selected and curated.
This script converts these sequences to FASTA format.
The name given to each sequence is its line number.
No arguments; the path to the file is hard-coded -- to be corrected.
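For reference, the same transformation can be sketched as a shell one-liner (the file names here are hypothetical; the actual script is the Python file with the hard-coded path):
```bash
## One sequence per line in, FASTA out; each record is named after its line number
awk '{ printf(">%d\n%s\n", NR, $0) }' protospacer_dico > protospacers.fasta
```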
### 06b_blast_protospaces.sh*
Protospacers come from the bacteria; we need to know where they map on the virus genome.
We tried different parameters, so this script is more a reminder of the commands and parameters tested -- to be corrected.
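An illustrative call of the kind the script keeps track of (the `blastn-short` task and the file names are assumptions, not necessarily the parameters that were kept):
```bash
## Short queries such as protospacers are usually searched with the blastn-short task
blastn -task blastn-short \
    -query protospacers.fasta \
    -subject data/refs/virus.fasta \
    -outfmt 6 > results/protospacer_hits.tsv
```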
### 07_2_run_vcf_parser_all_files.py
This is the analysis script currently in use; the others are old versions from before the git repository was started:
* 07_2_test.py: used for debugging. The inputs and outputs are in the folder `scripts/debug`.
* 07_run_vcf_parser_all_files.py*: an older version of the analysis script.

It describes the experimental plans used to compare the data and produce the graphics that were requested: binary and frequency heatmaps.
It requires the file `vcf_parser3.py` to be located in the same directory.
The analysis is defined in a function at the top of the script.
To run it I use ipython:
```bash
source ~/envs/coev/bin/activate
pip install ipython
```
To run from ipython:
```
%run 07_2_run_vcf_parser_all_files.py
```
----
## The other scripts
### procedure.sh
The commands used to launch the scripts above, as well as the supplementary commands to separate the different data, extract archives, and all the other actions, are written here.
It contains a pre-treatment of data to create sub-groups using symbolic links.
### README.md
This file :-P
### requirements_py-env.txt
File used by the script `00_create_py_env.sh`.
It contains the list of Python libraries and versions used for this analysis.
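It follows the standard pip requirements format; a hypothetical excerpt based on the libraries mentioned in `00_create_py_env.sh` (the real file also pins the exact versions):
```
biopython
pandas
matplotlib
multiqc
pyvcf
```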
### vcf_parser3.py
This is a Python library imported by `07_2_run_vcf_parser_all_files.py`.
It allows importing data from VCF files into a pandas dataframe to ease the analyses.
It works with objects: the _attributes_ contain the information and the _methods_ are internal functions of that object, for example "remove control SNPs".
The last class creates a heatmap with matplotlib.
More extensive documentation will be written in the future.
----
## Folders
### debug
Folder for the inputs and outputs used for debugging.
### lib
Folder containing the library of some tests, toy examples, and commands to be called from the other scripts.
### \_\_pycache\_\_
Dispensable.
Created automatically by Python when importing a `*.py` file.
#! /bin/bash
## SET THE PATH TO THE PHAGES DIRECTORY
# cd /home/user/work/coev/phages
## PREPARE DATA SUBGROUPS
## Uncompress raw data into data folder
tar -xzvf data/sequences.tar.gz -C data
## Change permissions to avoid accidents
chmod -w data/sequences/*
chmod -w data/sequences.tar.gz
## Make links to data to make sub-groups
## It makes it easier to handle groups of files
## This step is also done on scripts/01_quality_check.sh
mkdir -p data/fastq_ln/{R_seq,W_seq,Other_seq}
for i in $PWD/data/sequences/R*
do
ln -s $i $PWD/data/fastq_ln/R_seq
done
for i in $PWD/data/sequences/W*
do
ln -s $i $PWD/data/fastq_ln/W_seq
done
for i in $(ls data/sequences/ | grep -v -E "^W.*.fastq.gz|^R.*.fastq.gz")
do
ln -s $PWD/data/sequences/$i $PWD/data/fastq_ln/Other_seq
done
###################################
## CREATE PYTHON ENVIRONMENT -- With virtualenv
./scripts/00_create_py_env.sh
## environment located in:
## ~/envs/coev/
###################################
## QUALITY CHECK
./scripts/01_quality_check.sh $PWD/ qc_raw_data