From 5c73fc0b56bc0ee69e64b5078cedf80caa6a21eb Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 12:04:37 +0100 Subject: [PATCH 1/9] Added scripts/README.txt --- phages/scripts/README.txt | 32 ++++++++++++++++++++++++++++++++ phages/scripts/procedure.sh | 1 + 2 files changed, 33 insertions(+) create mode 100644 phages/scripts/README.txt create mode 100644 phages/scripts/procedure.sh diff --git a/phages/scripts/README.txt b/phages/scripts/README.txt new file mode 100644 index 0000000..415f3b8 --- /dev/null +++ b/phages/scripts/README.txt @@ -0,0 +1,32 @@ +## Contents of scripts folder + +The scripts are numbered in the order they are run. +If in doubt use the scripts which are executable. + +The description of the files is here below + +Files: + +* 00_create_py_env.sh +* 01_quality_check.sh* +* 02_trimm_and_clean.sh* +* 03_mapping.sh* +* 04_snpcalling.sh* +* 05b_convert_protospacer_dico2fasta.py* +* 06b_blast_protospaces.sh* +* 07_2_run_vcf_parser_all_files.py +* 07_2_test.py +* 07_run_vcf_parser_all_files.py* + +* procedure.sh +* README.txt +* requirements_py-env.txt + +* vcf_parser3.py + +Folders: + + +Files to ignore for a while ^_^ + +### \ No newline at end of file diff --git a/phages/scripts/procedure.sh b/phages/scripts/procedure.sh new file mode 100644 index 0000000..01dca2d --- /dev/null +++ b/phages/scripts/procedure.sh @@ -0,0 +1 @@ +/bin/bash -- GitLab From 6003a162c8c96ec7bcc4f6744607434db0836362 Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 14:01:55 +0100 Subject: [PATCH 2/9] Coding stye and list of files added to scripts/README.md. 
README.txt is now README.md --- phages/scripts/03_mapping.sh | 14 +++--- phages/scripts/README.md | 89 ++++++++++++++++++++++++++++++++++++ phages/scripts/README.txt | 63 +++++++++++++++++++++++-- 3 files changed, 156 insertions(+), 10 deletions(-) create mode 100644 phages/scripts/README.md diff --git a/phages/scripts/03_mapping.sh b/phages/scripts/03_mapping.sh index 8b78f71..eb24a81 100755 --- a/phages/scripts/03_mapping.sh +++ b/phages/scripts/03_mapping.sh @@ -23,22 +23,22 @@ virus_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_Sv/Sv -for i in $(find $path_fasta -name *_R1.fq.gz) +for i in $(find $path_fasta -name *_R1.fq.gz) do # echo $i - root_name=$(basename -s _R1.fq.gz $i) + root_name=$(basename -s _R1.fq.gz $i) var=$(dirname $i) outdir=${var/data\/trimmed/results/mapping}/ - echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam - echo $i ${i/_R1/_R2} + echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam + echo $i ${i/_R1/_R2} echo $virus_index echo "#### MAPPING" - bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam + bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam echo "#### SORTING" - samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam - samtools index -b ${outdir}${root_name}.sort.bam + samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam + samtools index -b ${outdir}${root_name}.sort.bam done diff --git a/phages/scripts/README.md b/phages/scripts/README.md new file mode 100644 index 0000000..ca86828 --- /dev/null +++ b/phages/scripts/README.md @@ -0,0 +1,89 @@ +## Contents of scripts folder + +The scripts are numbered in the order they are run. +If in doubt use the scripts which are executable. 
+ +A more detailed description of the files is below the list (use ctrl + f) + +Files: + +* 00_create_py_env.sh +* 01_quality_check.sh* +* 02_trimm_and_clean.sh* +* 03_mapping.sh* +* 04_snpcalling.sh* +* 05b_convert_protospacer_dico2fasta.py* +* 06b_blast_protospaces.sh* +* 07_2_run_vcf_parser_all_files.py +* 07_2_test.py +* 07_run_vcf_parser_all_files.py* + +* procedure.sh +* README.txt +* requirements_py-env.txt + +* vcf_parser3.py + +Folders: + +* debug +* lib +* __pycahce__ + + + +## Coding practices + +I tried to use as much as possible the Python Enhancement Proposal 8 (PEP-8). https://www.python.org/dev/peps/pep-0008/ + +A difference I use regularl is using double `##` at the begining of a line containing comments. +During the developement stages I comment some code lines that would be uncommented as a block. Having two '#' signs un real comments allows not to mistake them for command lines. + +Example: + +```python +## This block of code calculate the proportion +for i in input_list: + # print("proportion of the list") + print(i / sum(list)) +``` + +Concerning bash coding I use often double spaces to separate commands, parameters, and arguments. 
When using some long names it makes things more readable + +Example: + +```bash +for i in $(find $path_fasta -name *_R1.fq.gz) +do + # echo $i + root_name=$(basename -s _R1.fq.gz $i) + var=$(dirname $i) + outdir=${var/data\/trimmed/results/mapping}/ + + echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam + echo $i ${i/_R1/_R2} + echo $virus_index + + echo "#### MAPPING" + bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam + + echo "#### SORTING" + samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam + samtools index -b ${outdir}${root_name}.sort.bam + +done +``` + +Sometimes I'll put the arguments in different lines having an indentation: + +```bash +## A command with multiple parameters seaprated per line +samtools sort \ + -O BAM \ + -o ${outdir}${root_name}.sort.bam \ + ${outdir}${root_name}.sam + +## Short commands in one single line +samtools index -b ${outdir}${root_name}.sort.bam +``` + diff --git a/phages/scripts/README.txt b/phages/scripts/README.txt index 415f3b8..ca86828 100644 --- a/phages/scripts/README.txt +++ b/phages/scripts/README.txt @@ -3,7 +3,7 @@ The scripts are numbered in the order they are run. If in doubt use the scripts which are executable. -The description of the files is here below +A more detailed description of the files is below the list (use ctrl + f) Files: @@ -26,7 +26,64 @@ Files: Folders: +* debug +* lib +* __pycahce__ -Files to ignore for a while ^_^ -### \ No newline at end of file + +## Coding practices + +I tried to use as much as possible the Python Enhancement Proposal 8 (PEP-8). https://www.python.org/dev/peps/pep-0008/ + +A difference I use regularl is using double `##` at the begining of a line containing comments. +During the developement stages I comment some code lines that would be uncommented as a block. Having two '#' signs un real comments allows not to mistake them for command lines. 
+ +Example: + +```python +## This block of code calculate the proportion +for i in input_list: + # print("proportion of the list") + print(i / sum(list)) +``` + +Concerning bash coding I use often double spaces to separate commands, parameters, and arguments. When using some long names it makes things more readable + +Example: + +```bash +for i in $(find $path_fasta -name *_R1.fq.gz) +do + # echo $i + root_name=$(basename -s _R1.fq.gz $i) + var=$(dirname $i) + outdir=${var/data\/trimmed/results/mapping}/ + + echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam + echo $i ${i/_R1/_R2} + echo $virus_index + + echo "#### MAPPING" + bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam + + echo "#### SORTING" + samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam + samtools index -b ${outdir}${root_name}.sort.bam + +done +``` + +Sometimes I'll put the arguments in different lines having an indentation: + +```bash +## A command with multiple parameters seaprated per line +samtools sort \ + -O BAM \ + -o ${outdir}${root_name}.sort.bam \ + ${outdir}${root_name}.sam + +## Short commands in one single line +samtools index -b ${outdir}${root_name}.sort.bam +``` + -- GitLab From bfc05ce5a97b5b92c5ce1a5e4426b4181daaa594 Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 14:35:20 +0100 Subject: [PATCH 3/9] Corrected 02 and 03's input argument corrected. Commented 3 scripts in the readme. --- phages/scripts/02_trimm_and_clean.sh | 13 +++++-- phages/scripts/03_mapping.sh | 15 ++++--- phages/scripts/README.md | 58 +++++++++++++++++++++++++--- 3 files changed, 71 insertions(+), 15 deletions(-) diff --git a/phages/scripts/02_trimm_and_clean.sh b/phages/scripts/02_trimm_and_clean.sh index 9bfb71d..d41f470 100755 --- a/phages/scripts/02_trimm_and_clean.sh +++ b/phages/scripts/02_trimm_and_clean.sh @@ -1,9 +1,14 @@ #! 
/bin/bash +## VARIABLES -path=/home/enrique/work/Gandon/coevolution/phages/ +#path=/home/enrique/work/Gandon/coevolution/phages/ +path=$1 +n_threads=35 +## SCRIPT + mkdir -p data/trimmed/{W_seq,R_seq,Other_seq} trimm_summary=${path}data/summary_trimm @@ -45,7 +50,7 @@ do echo "Working on file " $shortname; java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \ PE \ - -threads 35 \ + -threads $n_threads \ -phred33 \ -summary /tmp/tmp.trimm_summary \ -quiet \ @@ -72,7 +77,7 @@ do echo "Working on file " $shortname; java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \ PE \ - -threads 35 \ + -threads $n_threads \ -phred33 \ -summary /tmp/tmp.trimm_summary \ -quiet \ @@ -100,7 +105,7 @@ do echo "Working on file " $shortname; java -jar /usr/local/src/Trimmomatic-0.38/trimmomatic-0.38.jar \ PE \ - -threads 35 \ + -threads $n_threads \ -phred33 \ -summary /tmp/tmp.trimm_summary \ -quiet \ diff --git a/phages/scripts/03_mapping.sh b/phages/scripts/03_mapping.sh index eb24a81..d32775e 100755 --- a/phages/scripts/03_mapping.sh +++ b/phages/scripts/03_mapping.sh @@ -2,7 +2,8 @@ ## DEFINE PATH -path=/home/enrique/work/Gandon/coevolution/phages/ +path=$1 +# path=/home/enrique/work/Gandon/coevolution/phages/ ## CREATE INDEXES AND @@ -13,27 +14,29 @@ path=/home/enrique/work/Gandon/coevolution/phages/ # bowtie2 --phred33 -5 12 -p 35 -t -x ${path}data/refs/indexes_Sv/Sv -1 ${path}data/trimmed/W_seq/W4T3_S54_R1.fq.gz -2 ${path}data/trimmed/W_seq/W4T3_S54_R2.fq.gz -S ${path}results/test.sam -path_fasta=/home/enrique/work/Gandon/coevolution/phages/data/trimmed/ +path_fasta=${path}data/trimmed/ -path_results=/home/enrique/work/Gandon/coevolution/phages/results/ +path_results=${path}results/ -bacteria_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_St/St - -virus_index=/home/enrique/work/Gandon/coevolution/phages/data/refs/indexes_Sv/Sv +bacteria_index=${path}data/refs/indexes_St/St +virus_index=${path}data/refs/indexes_Sv/Sv for i in $(find 
$path_fasta -name *_R1.fq.gz) do + ## Declare local variables # echo $i root_name=$(basename -s _R1.fq.gz $i) var=$(dirname $i) outdir=${var/data\/trimmed/results/mapping}/ + ## Give some feedback to the user echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam echo $i ${i/_R1/_R2} echo $virus_index + ## Mapping and indexing bam file echo "#### MAPPING" bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam diff --git a/phages/scripts/README.md b/phages/scripts/README.md index ca86828..361a161 100644 --- a/phages/scripts/README.md +++ b/phages/scripts/README.md @@ -30,14 +30,14 @@ Folders: * lib * __pycahce__ - +---- ## Coding practices -I tried to use as much as possible the Python Enhancement Proposal 8 (PEP-8). https://www.python.org/dev/peps/pep-0008/ +In python I tried to use as much as possible the Python Enhancement Proposal 8 (PEP-8). https://www.python.org/dev/peps/pep-0008/ -A difference I use regularl is using double `##` at the begining of a line containing comments. -During the developement stages I comment some code lines that would be uncommented as a block. Having two '#' signs un real comments allows not to mistake them for command lines. +A difference I use regularly is using double `##` at the beginning of a line containing *informative comments*. +During the development stages I comment some code lines that would be uncommented as a block. Having two '#' prevents informative comments from being uncommented and executed as code. Example: @@ -48,22 +48,30 @@ for i in input_list: print(i / sum(list)) ``` -Concerning bash coding I use often double spaces to separate commands, parameters, and arguments. When using some long names it makes things more readable +Concerning **bash** coding I use often double spaces to separate commands, parameters, and arguments.
When using some long names it makes things more readable + +Big chunks of code are commented with capitals and short phrases, +whereas longer phrases in comments are in lower case. Example: ```bash +## BIG CODE CHUNK + for i in $(find $path_fasta -name *_R1.fq.gz) do + ## Declare local variables # echo $i root_name=$(basename -s _R1.fq.gz $i) var=$(dirname $i) outdir=${var/data\/trimmed/results/mapping}/ + ## Give some feedback to the user echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam echo $i ${i/_R1/_R2} echo $virus_index + ## Mapping and indexing bam file echo "#### MAPPING" bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam @@ -87,3 +95,43 @@ samtools sort \ samtools index -b ${outdir}${root_name}.sort.bam ``` +---- + +## File descriptions + + +### 00_create_py_env.sh + +Creates a python virtual environment using `virtualenv`, the default python3 version of the system and will storte the environment in `~/envs/coev`. The installation of packages is done through pip. + + +### 01_quality_check.sh* + +It will use FastQC to create quality control reports and then use multiqc to assemble the reports in only file. To make things easier, the input files are separated in 3 groups R, W and Other. These groups come from different treatments. + +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages` + + +### 02_trimm_and_clean.sh* + +Launches Trimmomatic to clean data. +The parameters are embeded in the code -- for now + +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages` + + +### 03_mapping.sh* + +The index creation is commented in the top of the script. It's only required once. +The path to the input files is a full path using a variable. 
+The mapper is bowtie2, after mapping the sam is sorted and converted to a bam and indexed so it's ready for the next stage. + +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages` + + +* 04_snpcalling.sh* +* 05b_convert_protospacer_dico2fasta.py* +* 06b_blast_protospaces.sh* +* 07_2_run_vcf_parser_all_files.py +* 07_2_test.py +* 07_run_vcf_parser_all_files.py* \ No newline at end of file -- GitLab From 4987e4969540c0afe71b03e6feffbdd8474f91e6 Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 14:45:40 +0100 Subject: [PATCH 4/9] deleted README.txt --- phages/scripts/README.txt | 89 --------------------------------------- 1 file changed, 89 deletions(-) delete mode 100644 phages/scripts/README.txt diff --git a/phages/scripts/README.txt b/phages/scripts/README.txt deleted file mode 100644 index ca86828..0000000 --- a/phages/scripts/README.txt +++ /dev/null @@ -1,89 +0,0 @@ -## Contents of scripts folder - -The scripts are numbered in the order they are run. -If in doubt use the scripts which are executable. - -A more detailed description of the files is below the list (use ctrl + f) - -Files: - -* 00_create_py_env.sh -* 01_quality_check.sh* -* 02_trimm_and_clean.sh* -* 03_mapping.sh* -* 04_snpcalling.sh* -* 05b_convert_protospacer_dico2fasta.py* -* 06b_blast_protospaces.sh* -* 07_2_run_vcf_parser_all_files.py -* 07_2_test.py -* 07_run_vcf_parser_all_files.py* - -* procedure.sh -* README.txt -* requirements_py-env.txt - -* vcf_parser3.py - -Folders: - -* debug -* lib -* __pycahce__ - - - -## Coding practices - -I tried to use as much as possible the Python Enhancement Proposal 8 (PEP-8). https://www.python.org/dev/peps/pep-0008/ - -A difference I use regularl is using double `##` at the begining of a line containing comments. -During the developement stages I comment some code lines that would be uncommented as a block. 
Having two '#' signs un real comments allows not to mistake them for command lines. - -Example: - -```python -## This block of code calculate the proportion -for i in input_list: - # print("proportion of the list") - print(i / sum(list)) -``` - -Concerning bash coding I use often double spaces to separate commands, parameters, and arguments. When using some long names it makes things more readable - -Example: - -```bash -for i in $(find $path_fasta -name *_R1.fq.gz) -do - # echo $i - root_name=$(basename -s _R1.fq.gz $i) - var=$(dirname $i) - outdir=${var/data\/trimmed/results/mapping}/ - - echo -e "\n"phage $root_name -\> ${outdir}${root_name}.sam - echo $i ${i/_R1/_R2} - echo $virus_index - - echo "#### MAPPING" - bowtie2 --phred33 -5 12 -p 24 -t -x $virus_index -1 $i -2 ${i/_R1/_R2} -S ${outdir}${root_name}.sam - - echo "#### SORTING" - samtools sort -O BAM -o ${outdir}${root_name}.sort.bam ${outdir}${root_name}.sam - samtools index -b ${outdir}${root_name}.sort.bam - -done -``` - -Sometimes I'll put the arguments in different lines having an indentation: - -```bash -## A command with multiple parameters seaprated per line -samtools sort \ - -O BAM \ - -o ${outdir}${root_name}.sort.bam \ - ${outdir}${root_name}.sam - -## Short commands in one single line -samtools index -b ${outdir}${root_name}.sort.bam -``` - -- GitLab From fa2f815aa5aa51f593d026ea555374680181e3d8 Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 14:56:41 +0100 Subject: [PATCH 5/9] Corrected hard written paths on script 04 and updated README.md --- phages/scripts/04_snpcalling.sh | 2 +- phages/scripts/README.md | 32 ++++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/phages/scripts/04_snpcalling.sh b/phages/scripts/04_snpcalling.sh index fe9a121..470e1d9 100755 --- a/phages/scripts/04_snpcalling.sh +++ b/phages/scripts/04_snpcalling.sh @@ -3,7 +3,7 @@ ## SNPCALLING wd=$1 ## Working directory's path -ref=$2 ## path to rference.fasta +ref=$2 
## path to reference.fasta od=$3 ## Output directory diff --git a/phages/scripts/README.md b/phages/scripts/README.md index 361a161..5b5be7d 100644 --- a/phages/scripts/README.md +++ b/phages/scripts/README.md @@ -109,7 +109,7 @@ Creates a python virtual environment using `virtualenv`, the default python3 ver It will use FastQC to create quality control reports and then use multiqc to assemble the reports in only file. To make things easier, the input files are separated in 3 groups R, W and Other. These groups come from different treatments. -This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages` +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke** ### 02_trimm_and_clean.sh* @@ -117,7 +117,7 @@ This script takes one argument: The path to the working directory, which is the Launches Trimmomatic to clean data. The parameters are embeded in the code -- for now -This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages` +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke** ### 03_mapping.sh* @@ -126,12 +126,32 @@ The index creation is commented in the top of the script. It's only required onc The path to the input files is a full path using a variable. The mapper is bowtie2, after mapping the sam is sorted and converted to a bam and indexed so it's ready for the next stage. -This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages` +This script takes one argument: The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. 
**Don't forget that final stroke** + + +### 04_snpcalling.sh* + +It uses freebayes to make the snp calling + +This script takes three arguments: +1. The path to the working directory, which is the project directory: `/home/user/work/coevolution/phages/`. **Don't forget that final stroke** +2. Path to the reference +3. Path to the output directory + + +### 05b_convert_protospacer_dico2fasta.py* + +The *protospacer_dico* is a raw text file with only one sequence per line. +Each line is a protospacer sequence that has been manually selected and curated. +This script converts these sequences to fasta format. +The name given to each sequence is the line number. + +No arguments. the path to the file is hard written -- To be corrected + + +### 06b_blast_protospaces.sh* -* 04_snpcalling.sh* -* 05b_convert_protospacer_dico2fasta.py* -* 06b_blast_protospaces.sh* * 07_2_run_vcf_parser_all_files.py * 07_2_test.py * 07_run_vcf_parser_all_files.py* \ No newline at end of file -- GitLab From 49794d53a1500331f018d128e0efcfa1e96dd911 Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 16:08:32 +0100 Subject: [PATCH 6/9] scripts/README.md all scripts have been described --- phages/scripts/06b_blast_protospaces.sh | 0 phages/scripts/README.md | 31 ++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) mode change 100755 => 100644 phages/scripts/06b_blast_protospaces.sh diff --git a/phages/scripts/06b_blast_protospaces.sh b/phages/scripts/06b_blast_protospaces.sh old mode 100755 new mode 100644 diff --git a/phages/scripts/README.md b/phages/scripts/README.md index 5b5be7d..55977bc 100644 --- a/phages/scripts/README.md +++ b/phages/scripts/README.md @@ -151,7 +151,32 @@ No arguments. the path to the file is hard written -- To be corrected ### 06b_blast_protospaces.sh* +Protospacers come from bacteria, we need to know where do they map on the virus' genome.
-* 07_2_run_vcf_parser_all_files.py -* 07_2_test.py -* 07_run_vcf_parser_all_files.py* \ No newline at end of file +We tried out different parameters, this script is more a reminder of the commands and parameters tested. -- To be corrected + + +## 07_2_run_vcf_parser_all_files.py + +This is the current script being used, the previous are old versions, before starting a git. + +* 07_2_test.py : +* 07_run_vcf_parser_all_files.py* + +This is the analysis script. +It describes the experimental planes to comapare data and make the graphics that were requested: binary and frequency heatmaps. +It requires the file `vcf_parser3.py` to be located in the same directory. + +The analysis is defined in a function in the top of the script. + +To run it I use ipython: + +```bash +source ~/envs/coev/bin/activate +pip install ipython +``` + +To run from ipython: +``` +%run 07_2_run_vcf_parser_all_files.py +``` -- GitLab From 0b4d4c6cb377304ec3a24e814d43d6abbc9f42eb Mon Sep 17 00:00:00 2001 From: eortega Date: Fri, 10 Jan 2020 16:23:28 +0100 Subject: [PATCH 7/9] Last description of reminding files. --- phages/scripts/README.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/phages/scripts/README.md b/phages/scripts/README.md index 55977bc..9b2639d 100644 --- a/phages/scripts/README.md +++ b/phages/scripts/README.md @@ -19,7 +19,7 @@ Files: * 07_run_vcf_parser_all_files.py* * procedure.sh -* README.txt +* README.md * requirements_py-env.txt * vcf_parser3.py @@ -160,7 +160,7 @@ We tried out different parameters, this script is more a reminder of the command This is the current script being used, the previous are old versions, before starting a git. -* 07_2_test.py : +* 07_2_test.py : This is for debuging. The inputs and outputs are in the folder `scripts/debug` * 07_run_vcf_parser_all_files.py* This is the analysis script. 
@@ -180,3 +180,38 @@ To run from ipython: ``` %run 07_2_run_vcf_parser_all_files.py ``` + +---- + + + +## The other scripts + +### procedure.sh + +The commands used to launch the scripts up here as well as the supplementary commands to separate the different data, extract and all other action is written here. + + +### README.md + +This file :-P + + +### requirements_py-env.txt + +File used by the script `00_create_py_env.sh` +It contains the list of libraries and versions used in python for this analysis. + + +### vcf_parser3.py + +This is a python library which is imported by `07_2_run_vcf_parser_all_files.py` + +It allows importing data from vcf files and putting it in a pandas dataframe format to ease the analyses. + +It works with objects: The _attributes_ contain information and the _methods_ are internal functions for that object. For example "remove control SNPs" + +The last class allows to create a heatmap with matplotlib. + +A more extended documentation will be made in the future + -- GitLab From c4017642b9db2906587bec406210efa4a85412a3 Mon Sep 17 00:00:00 2001 From: eortega Date: Mon, 13 Jan 2020 21:07:01 +0100 Subject: [PATCH 8/9] Separate raw data in groups and updated README.md accordingly --- phages/scripts/README.md | 22 ++++++++++++++++++- phages/scripts/procedure.sh | 44 ++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/phages/scripts/README.md b/phages/scripts/README.md index 9b2639d..338b3fa 100644 --- a/phages/scripts/README.md +++ b/phages/scripts/README.md @@ -28,7 +28,7 @@ Folders: * debug * lib -* __pycahce__ +* \_\_pycache\_\_ ---- @@ -191,6 +191,8 @@ To run from ipython: The commands used to launch the scripts up here as well as the supplementary commands to separate the different data, extract and all other action is written here.
+It contains a pre-treatment of data to create sub-groups using symbolic links + ### README.md @@ -215,3 +217,21 @@ The last class allows to create a heatmap with matplotlib. A more extended documentation will be made in the future + +---- + +## Folders + +### debug + +Folder to put inputs and outputs for debugging. + +### lib + +Folder containing the library of some tests, +toy examples and commands to be called from the other scripts + +### \_\_pycache\_\_ + +Dispensable. +Created by python automatically when importing a `*.py` diff --git a/phages/scripts/procedure.sh b/phages/scripts/procedure.sh index 01dca2d..486952f 100644 --- a/phages/scripts/procedure.sh +++ b/phages/scripts/procedure.sh @@ -1 +1,43 @@ -/bin/bash +#! /bin/bash + + +## SET THE PATH TO THE PHAGES DIRECTORY +# cd /home/user/work/coev/phages + + +## PREPARE DATA + +## Uncompress raw data into data folder +tar -xzvf data/sequences.tar.gz -C data + +## Change permissions to avoid accidents +chmod -w data/sequences/* +chmod -w data/sequences.tar.gz + +## Make links to data to make sub-groups +## It makes it easier to handle groups of files + +mkdir -p data/fastq_ln/{R_seq,W_seq,Other_seq} + +for i in $PWD/data/sequences/R* +do + ln -s $i $PWD/data/fastq_ln/R_seq +done + + +for i in $PWD/data/sequences/W* +do + ln -s $i $PWD/data/fastq_ln/W_seq +done + + +for i in $(ls raw_data/sequences/ | grep -v -E "^W.*.fastq.gz|^R.*.fastq.gz") +do + ln -s $PWD/raw_data/sequences/$i $PWD/data/fastq_ln/Other_seq +done + + + +################################### + + -- GitLab From a0e243efce3e854a87bbe5cf29cf2679c12ffa16 Mon Sep 17 00:00:00 2001 From: eortega Date: Tue, 14 Jan 2020 17:48:53 +0100 Subject: [PATCH 9/9] scripts/procedure.sh updated and scripts/00_create_py_env.sh and 01_quality_check.sh updated to new architecture --- phages/scripts/00_create_py_env.sh | 2 +- phages/scripts/01_quality_check.sh | 34 ++++++++++++++++++------------ phages/scripts/README.md | 2 +- phages/scripts/procedure.sh | 12
++++++++++- 4 files changed, 34 insertions(+), 16 deletions(-) mode change 100644 => 100755 phages/scripts/00_create_py_env.sh diff --git a/phages/scripts/00_create_py_env.sh b/phages/scripts/00_create_py_env.sh old mode 100644 new mode 100755 index 8612ac9..c5743f9 --- a/phages/scripts/00_create_py_env.sh +++ b/phages/scripts/00_create_py_env.sh @@ -27,7 +27,7 @@ source ${py_env_dir}/bin/activate pip install --upgrade pip ## -pip install -r requirements_py-env.txt +pip install -r scripts/requirements_py-env.txt # pip install biopython pandas matplotlib multiqc pyvcf diff --git a/phages/scripts/01_quality_check.sh b/phages/scripts/01_quality_check.sh index 74ba967..cf1fbf6 100755 --- a/phages/scripts/01_quality_check.sh +++ b/phages/scripts/01_quality_check.sh @@ -1,9 +1,14 @@ #!/bin/bash +## Launch example from coevolution/phages/ +## ./scripts/01_quality_check.sh $PWD/ qc_rawd_data + + ## PATH TO WORKING DIRECTORY path=$1 #path=/home/enrique/work/Gandon/coevolution/phages/ +step_name=$2 @@ -11,6 +16,9 @@ path=$1 source ~/envs/coev/bin/activate +## Some temporary files are created nearby the source files +## Using links from a tmp will avoid errors in case the directory +## containing the raw data is not writable ## SEPARATE THE FILES BY NAME @@ -20,47 +28,47 @@ W_seq=$(mktemp -d) R_seq=$(mktemp -d) Other_seq=$(mktemp -d) - +echo $W_seq $R_seq $Other_seq ### CREATE SYMBOLIC LINKS TO SEQUENCE FILES ### DIIVIDED IN 3 DIRECTORIES - echo "CREATE SYMBOLIC LINKS" -for i in $(ls ${path}raw_data/sequences/W*); +for i in $(ls ${path}data/sequences/W*); do ln -s $i $W_seq; done -for i in $(ls ${path}raw_data/sequences/R*); +for i in $(ls ${path}data/sequences/R*); do ln -s $i $R_seq; done -for i in $(ls ${path}raw_data/sequences/ | grep -v ^W | grep -v ^R); +for i in $(ls ${path}data/sequences/ | grep -v ^W | grep -v ^R); do #echo $i; ln -s ${path}raw_data/sequences/$i $Other_seq done +echo "CREATE SYMBOLIC LINKS -- DONE" ### MAKE RUN FASTQC ON EACH GROUPE -mkdir -p 
${path}qual/fastqc/{W_seq,R_seq,Other_seq} +mkdir -p ${path}steps/${step_name}/fastqc/{W_seq,R_seq,Other_seq} -fastqc -t 35 --noextract -o ${path}qual/fastqc/W_seq $W_seq/* +fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/W_seq $W_seq/* # multiqc -f -i W_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/W_seq -fastqc -t 35 --noextract -o ${path}qual/fastqc/R_seq $R_seq/* +fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/R_seq $R_seq/* # multiqc -f -i R_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/R_seq/ -fastqc -t 35 --noextract -o ${path}qual/fastqc/Other_seq $Other_seq/* +fastqc -t 35 --noextract -o ${path}steps/$step_name/fastqc/Other_seq $Other_seq/* # multiqc -f -i Other_seq -o ${path}qual/multiqc/ ${path}qual/fastqc/Other_seq @@ -71,17 +79,17 @@ fastqc -t 35 --noextract -o ${path}qual/fastqc/Other_seq $Other_seq/* ## LOOP MULTIQC DEPENDING ON THE INPUTS ## MAKE DIRECTORIES FOR THE DIFFERENT SAMPLES: -mkdir -p ${path}qual/multiqc/{W,R,Other} +mkdir -p ${path}steps/$step_name/multiqc/{W_seq,R_seq,Other_seq} for i in $(seq 8) do - multiqc -f -i W${i}_seq -n W${i} -o ${path}qual/multiqc/W ${path}qual/fastqc/W_seq/W${i}* - multiqc -f -i R${i}_seq -n R${i} -o ${path}qual/multiqc/R ${path}qual/fastqc/R_seq/R${i}* + multiqc -f -i W${i}_seq -n W${i} -o ${path}steps/$step_name/multiqc/W_seq ${path}steps/$step_name/fastqc/W_seq/W${i}* + multiqc -f -i R${i}_seq -n R${i} -o ${path}steps/$step_name/multiqc/R_seq ${path}steps/$step_name/fastqc/R_seq/R${i}* done for i in 2972 A B C D CTRL T Undetermined do - multiqc -f -i ${i}_seq -n ${i} -o ${path}qual/multiqc/Other ${path}qual/fastqc/Other_seq/${i}* + multiqc -f -i ${i}_seq -n ${i} -o ${path}steps/$step_name/multiqc/Other_seq ${path}steps/$step_name/fastqc/Other_seq/${i}* done diff --git a/phages/scripts/README.md b/phages/scripts/README.md index 338b3fa..39441b2 100644 --- a/phages/scripts/README.md +++ b/phages/scripts/README.md @@ -191,7 +191,7 @@ To run from ipython: The commands used to launch the 
scripts up here as well as the supplementary commands to separate the different data, extract and all other action is written here. -It contains a pre-treatment of data to create sub-groups using symbolic links +It contains a pre-treatment of data to create sub-groups using symbolic links. ### README.md diff --git a/phages/scripts/procedure.sh b/phages/scripts/procedure.sh index 486952f..153f015 100644 --- a/phages/scripts/procedure.sh +++ b/phages/scripts/procedure.sh @@ -5,7 +5,7 @@ # cd /home/user/work/coev/phages -## PREPARE DATA +## PREPARE DATA SUBGROUPS ## Uncompress raw data into data folder tar -xzvf data/sequences.tar.gz -C data @@ -17,6 +17,8 @@ chmod -w data/sequences.tar.gz ## Make links to data to make sub-groups ## It makes it easier to handle groups of files + +## This step is also done on scripts/01_quality_check.sh mkdir -p data/fastq_ln/{R_seq,W_seq,Other_seq} for i in $PWD/data/sequences/R* @@ -39,5 +41,13 @@ done ################################### +## CREATE PYTHON ENVIRONMENT -- With virtualenv + +./scripts/00_create_py_env.sh +## environment located in: +## ~/envs/coev/ +################################### +## QUALITY CHECK +./scripts/01_quality_check.sh $PWD/ qc_raw_data -- GitLab