Commit 2731529c authored by peguerin's avatar peguerin
Browse files

init release 1

parent 3c5fe8c4
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
### Paired end alignment then keep reads with quality > 40
rule illuminapairedend:
input:
R1=config["fichiers"]["folder_fastq"]+'{run}_R1.fastq.gz',
R2=config["fichiers"]["folder_fastq"]+'{run}_R2.fastq.gz',
output:
fq='01_illuminapairedend/{run}.fastq'
singularity:
config["singularity"]["obitools"]
log:
'../99_log/02_assembly/01_illuminapairedend/{run}.log'
params:
s_min=config["illuminapairedend"]["s_min"]
shell:
'''illuminapairedend -r {input.R2} {input.R1} --score-min={params.s_min} > {output.fq} 2> {log}'''
### Remove unaligned sequence records
# Keep only records whose `mode` attribute is not "joined", i.e. pairs
# that were actually overlapped by the previous assembly step.
rule remove_unaligned:
    input:
        fq='01_illuminapairedend/{run}.fastq'
    output:
        ali='02_remove_unaligned/{run}.ali.fastq'
    log:
        '../99_log/02_assembly/02_remove_unaligned/{run}.log'
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''obigrep -p 'mode!=\"joined\"' {input.fq} > {output.ali} 2> {log}'''
### Assign each sequence to a taxon
# Run `ecotag` against the reference database selected for this
# projet/marker/run; the database paths (`bdr`, `fasta`) are looked up
# in the per-run record table `dfpmr` defined by the including Snakefile.
rule assign_taxon:
    input:
        '02_dereplicated/{run}.uniq.fasta'
    output:
        '03_assigned/{run}.tag.u.fasta'
    log:
        '../99_log/05_assignment/03_assign_taxon/{run}.log'
    params:
        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.run].to_dict('records')[0],
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''ecotag -d {params.drun[bdr]} -R {params.drun[fasta]} {input} > {output} 2> {log}'''
### Assign each sequence record to the corresponding sample/marker combination
# NOTE(review): the assembled fastq is referenced through
# `{params.dmulti[run]}` inside the shell command instead of an `input:`
# directive, so Snakemake does not track it as a dependency — confirm
# this is intentional.
rule assign_sequences:
    output:
        assign='01_assign_sequences/{demultiplex}.ali.assigned.fastq',
        unid='01_assign_sequences/{demultiplex}.unidentified.fastq'
    log:
        '../99_log/03_demultiplex/01_assign_sequences/{demultiplex}.log'
    params:
        # per-demultiplex record (dat file path, run name, ...) from dfRunMarker
        dmulti= lambda wildcards: dfRunMarker[dfRunMarker.projMarkRun == wildcards.demultiplex].to_dict('records')[0],
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''ngsfilter -t {params.dmulti[dat]} -u {output.unid} ../02_assembly/02_remove_unaligned/{params.dmulti[run]}.ali.fastq --fasta-output > {output.assign} 2> {log}'''
### Split the input sequence file in a set of subfiles according to the values of attribute `sample`
# `obisplit` writes one fasta per sample into the per-demultiplex
# directory, which is created first with mkdir -p.
rule split_sequences:
    input:
        '01_assign_sequences/{demultiplex}.ali.assigned.fastq'
    log:
        '../99_log/03_demultiplex/02_split_sequences/{demultiplex}.log'
    params:
        dir='02_raw/{demultiplex}/'
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''mkdir -p {params.dir}; obisplit -p "{params.dir}" -t sample --fasta {input} 2> {log}'''
### Dereplicate and merge samples together
# `obiuniq -m sample` collapses strictly identical sequences while
# keeping per-sample counts in the record attributes.
rule dereplicate_runs:
    input:
        '01_runs/{run}.fasta'
    output:
        '02_dereplicated/{run}.uniq.fasta'
    log:
        '../99_log/05_assignment/02_dereplicated/{run}.log'
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''obiuniq -m sample {input} > {output} 2> {log}'''
### dereplicate reads into uniq sequences
# Same dereplication as `dereplicate_runs`, but applied to the raw
# per-demultiplex fasta produced by the demultiplexing workflow.
rule dereplicate_samples:
    input:
        '../03_demultiplex/02_raw/{demultiplexs}.fasta'
    output:
        '01_dereplicated/{demultiplexs}.uniq.fasta'
    log:
        '../99_log/04_filter_samples/01_dereplicated/{demultiplexs}.log'
    params:
        # per-demultiplex record from the checked table built by the Snakefile
        dmulti= lambda wildcards: dfMultiChecked[dfMultiChecked.demultiplex == wildcards.demultiplexs].to_dict('records')[0],
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''mkdir -p 01_dereplicated/{params.dmulti[projmarkrun]}; obiuniq -m sample {input} > {output} 2> {log}'''
### only sequence more than 20bp with no ambiguity IUAPC with total coverage greater than 10 reads
# Three obigrep filters are combined: minimum total count, strict
# ACGT alphabet (no IUPAC ambiguity codes), and minimum length.
rule goodlength_samples:
    input:
        '01_dereplicated/{demultiplexs}.uniq.fasta'
    output:
        '02_goodlength/{demultiplexs}.l.u.fasta'
    log:
        '../99_log/04_filter_samples/02_goodlength/{demultiplexs}.log'
    params:
        count=config["good_length_samples"]["count"],
        seq_length=config["good_length_samples"]["seq_length"]
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''obigrep -p 'count>{params.count}' -s '^[ACGT]+$' -p 'seq_length>{params.seq_length}' {input} > {output} 2> {log}'''
### Clean the sequences for PCR/sequencing errors (sequence variants)
# `obiclean` crashes on empty input, so an empty input file simply
# yields an empty (touched) output file instead.
rule clean_pcrerr_samples:
    input:
        '02_goodlength/{demultiplexs}.l.u.fasta'
    output:
        '03_clean_pcrerr/{demultiplexs}.r.l.u.fasta'
    log:
        '../99_log/04_filter_samples/03_clean_pcrerr/{demultiplexs}.log'
    params:
        # obiclean ratio threshold from the workflow configuration
        r=config["clean_pcrerr_samples"]["r"]
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''if [[ -s {input} ]]; then obiclean -r {params.r} {input} > {output} 2> {log} ; else touch {output} 2> {log} ; fi'''
### Remove sequence which are classified as 'internal' by obiclean
# Keep only records never flagged as "internal" (obiclean_internalcount
# == 0); like the previous rule, empty inputs produce an empty output.
rule rm_internal_samples:
    input:
        '03_clean_pcrerr/{demultiplexs}.r.l.u.fasta'
    output:
        '04_filtered/{demultiplexs}.c.r.l.u.fasta'
    log:
        '../99_log/04_filter_samples/04_filtered/{demultiplexs}.log'
    params:
        dmulti= lambda wildcards: dfMultiChecked[dfMultiChecked.demultiplex == wildcards.demultiplexs].to_dict('records')[0],
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''if [[ -s {input} ]]; then mkdir -p 04_filtered/{params.dmulti[projmarkrun]}; obigrep -p "obiclean_internalcount == 0" {input} > {output} 2> {log} ; else touch {output} 2> {log} ; fi'''
### Some unuseful attributes can be removed at this stage
# Strip bookkeeping attributes added by the earlier obitools steps
# (obiclean counters, primer/tag match details, alignment scores, ...)
# to keep only taxonomy-relevant annotations in the final fasta.
rule rm_attributes:
    input:
        '03_assigned/{run}.tag.u.fasta'
    output:
        '04_formated/{run}.a.t.u.fasta'
    log:
        '../99_log/05_assignment/04_rm_attributes/{run}.log'
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
        --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
        --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
        --delete-tag=obiclean_head --delete-tag=obiclean_headcount \
        --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
        --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
        --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
        --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
        --delete-tag=tail_quality {input} > {output} 2> {log}'''
### The sequences can be sorted by decreasing order of count
rule sort_runs:
    input:
        '04_formated/{run}.a.t.u.fasta'
    output:
        '04_formated/{run}.s.a.t.u.fasta'
    log:
        '../99_log/05_assignment/05_sort_runs/{run}.log'
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''obisort -k count -r {input} > {output} 2> {log}'''
### Generate a table final results
# Export the final fasta as a tab-separated table with `obitab`, after
# creating the per-projet/per-marker destination directory.
rule table_runs:
    input:
        '04_formated/{run}.s.a.t.u.fasta'
    output:
        '../06_final_tables/{run}.csv'
    log:
        '../99_log/05_assignment/06_table_runs/{run}.log'
    params:
        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.run].to_dict('records')[0],
    singularity:
        config["singularity"]["obitools"]
    shell:
        '''mkdir -p ../06_final_tables/{params.drun[projet]}/{params.drun[marker]}; obitab -o {input} > {output} 2> {log}'''
......@@ -35,5 +35,21 @@ blacklist:
- 180430_SN234_A_L001_AIMI-9
- 180504_SND405_A_L001_AIMI-10
- 180504_SND405_A_L002_AIMI-11
merge_fastq:
encoding: 33
illuminapairedend:
s_min: 40
good_length_samples:
count : 10
seq_length : 20
clean_pcrerr_samples:
r : 0.05
assign_taxon:
bdr:
mamm: /media/superdisk/edna/donnees/reference_database/reference_database_mamm/embl_std
teleo: /media/superdisk/edna/donnees/reference_database/reference_database_teleo/embl_std
chond: /media/superdisk/edna/donnees/reference_database/reference_database_chond/embl_std
vert: /media/superdisk/edna/donnees/reference_database/reference_database_vert/embl_std
fasta:
mamm: /media/superdisk/edna/donnees/reference_database/reference_database_mamm/db_embl_std.fasta
teleo: /media/superdisk/edna/donnees/reference_database/reference_database_teleo/db_embl_std.fasta
chond: /media/superdisk/edna/donnees/reference_database/reference_database_chond/db_embl_std.fasta
vert: /media/superdisk/edna/donnees/reference_database/reference_database_vert/db_embl_std.fasta
\ No newline at end of file
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
###############################################################################
# MODULES
###############################################################################
import pandas
###############################################################################
# GLOBAL VARIABLES
###############################################################################
#configfile: "01_infos/config.yaml"
rapidrunfile = "../"+config['fichiers']['rapidrun']
dfr =pandas.read_csv(rapidrunfile, sep="\t")
dfr.columns = ['plaque', 'run','sample','projet','marker']
## remove blacklisted projets
blacklistedProjets = config['blacklist']['projet']
dfrp= dfr[dfr.run.isin(blacklistedProjets) == False]
## remove blacklisted runs
blacklistedRuns = config['blacklist']['run']
dfrm= dfrp[dfrp.run.isin(blacklistedRuns) == False]
uniqRuns=dfrm.run.unique()
###############################################################################
# RULES
###############################################################################
rule all:
input:
expand("{folder}{run}_R1.fastq.gz", run=uniqRuns,folder=config["fichiers"]["folder_fastq"]),
expand('01_illuminapairedend/{run}.fastq', run=uniqRuns),
expand('02_remove_unaligned/{run}.ali.fastq', run=uniqRuns),
expand('../99_log/02_assembly/02_remove_unaligned/{run}.log',run=uniqRuns),
expand('../99_log/02_assembly/01_illuminapairedend/{run}.log',run=uniqRuns)
include: "../00_rules/assembly.smk"
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
###############################################################################
# MODULES
###############################################################################
import pandas
import glob
import os
###############################################################################
# FUNCTIONS
###############################################################################
def str_join(df, sep, *cols):
    """Concatenate several DataFrame columns into a single Series of strings.

    Each column is converted to str and the columns are joined left to
    right with *sep* inserted between consecutive columns.
    """
    from functools import reduce

    def _cat(left, right):
        return left.astype(str).str.cat(right.astype(str), sep=sep)

    return reduce(_cat, [df[name] for name in cols])
###############################################################################
# GLOBAL VARIABLES
###############################################################################
#configfile: "01_infos/config.yaml"

## per-sample demultiplexing description (tags, primers, projet, marker, run)
dfMulti = pandas.read_csv("../01_infos/all_demultiplex.csv", sep=",")
rapidrunfile = "../" + config['fichiers']['rapidrun']
#rapidrunfile="../01_infos/all_samples.tsv"
dfr = pandas.read_csv(rapidrunfile, sep="\t")
dfr.columns = ['plaque', 'run', 'sample', 'projet', 'marker']

## remove blacklisted projets
## BUG FIX: the projet blacklist must be matched against the `projet`
## column; it was previously tested against `run`, so blacklisted
## projects were never actually removed.
blacklistedProjets = config['blacklist']['projet']
dfrp = dfr[~dfr.projet.isin(blacklistedProjets)]
## remove blacklisted runs
blacklistedRuns = config['blacklist']['run']
dfrm = dfrp[~dfrp.run.isin(blacklistedRuns)]
#dfrm=dfr

## keep only marker and run information for demultiplexing;
## .copy() avoids pandas SettingWithCopyWarning on the column assignments
dfRunMarker = dfrm[['marker', 'run']].copy()
dfRunMarker['runMarker'] = str_join(dfrm, '/', 'marker', 'run')
dfRunMarker['projMarker'] = str_join(dfrm, '/', 'projet', 'marker')
dfRunMarker = dfRunMarker.drop_duplicates()
#dfRunMarker['dat']=dfRunMarker['marker'].map(dicMark)
#dfRunMarker['dat']=dfRunMarker['marker'].map(config['fichiers']['dat'])
with pandas.option_context('display.max_rows', None, 'display.max_columns', None, 'max_colwidth', 1800):
    print(dfRunMarker)
listRunMarker = list(dfRunMarker.runMarker)
#dfMulti=dfMulti[(dfMulti.projet != 'Guadeloupe')]
## projet/marker/run identifier used as the {demultiplex} wildcard
dfRunMarker['projMarkRun'] = dfRunMarker['projMarker'] + "/" + dfRunMarker['run']

## write .dat files for ngsfilter for each projet/marker/run
for projmarkerrun in dfRunMarker['projMarkRun'].unique():
    thisProj, thisMarker, thisRun = projmarkerrun.split("/")
    fileName = "00_dat/" + str(thisProj) + "_" + str(thisMarker) + "_" + str(thisRun) + ".tsv"
    dfprojmarkerrunMulti = dfMulti[(dfMulti.projet == thisProj) & (dfMulti.marker == thisMarker) & (dfMulti.run == thisRun)]
    if dfprojmarkerrunMulti.empty:
        ## no demultiplexing info for this projet/marker/run: warn and skip
        print("Writing sample description files for project and marker", thisProj, thisMarker, ":", "WARNING no information about it, check the file", rapidrunfile, "or blacklist this project")
        continue
    print("Writing sample description files for project and marker", thisProj, thisMarker, ":", fileName)
    thisExp = list(dfprojmarkerrunMulti['projet'])
    thisSample = list(dfprojmarkerrunMulti['sample'])
    thisTags = list(dfprojmarkerrunMulti['barcode5'])
    ThisForward_primer = list(dfprojmarkerrunMulti['primer5'])
    ThisReverse_primer = list(dfprojmarkerrunMulti['primer3'])
    ## ngsfilter requires an extra-information column; "F" is a placeholder
    This_extra_info = ["F"] * len(thisExp)
    thisDat = {'#exp': thisExp, 'sample': thisSample, 'tags': thisTags, 'forward_primer': ThisForward_primer, 'reverse_primer': ThisReverse_primer, 'extra_information': This_extra_info}
    dfDat = pandas.DataFrame(thisDat, columns=['#exp', 'sample', 'tags', 'forward_primer', 'reverse_primer', 'extra_information'])
    dfDat = dfDat.drop_duplicates()
    #print(dfDat)
    ## remove any stale file from a previous execution, then (re)write it
    if glob.glob(fileName):
        os.remove(fileName)
    dfDat.to_csv(fileName, index=False, sep="\t")

## path of the ngsfilter .dat file for each projet/marker/run
dfRunMarker['dat'] = ["00_dat/" + ele.replace('/', '_') + ".tsv" for ele in dfRunMarker['projMarkRun']]

rule all:
    input:
        expand('01_assign_sequences/{demultiplex}.ali.assigned.fastq', demultiplex=dfRunMarker.projMarkRun),
        expand('01_assign_sequences/{demultiplex}.unidentified.fastq', demultiplex=dfRunMarker.projMarkRun),
        expand('../99_log/03_demultiplex/01_assign_sequences/{demultiplex}.log', demultiplex=dfRunMarker.projMarkRun),
        expand('../99_log/03_demultiplex/02_split_sequences/{demultiplex}.log', demultiplex=dfRunMarker.projMarkRun)

include: "../00_rules/demultiplex.smk"
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
#configfile: "../config.yaml"
import pandas
import os.path
dfMulti = pandas.read_csv("../01_infos/all_demultiplex.csv", sep=",")
## check demultiplexing results
dfMultiChecked = dfMulti
for thisDemultiplex in dfMulti.demultiplex:
file_sample = "../03_demultiplex/02_raw/"+thisDemultiplex+".fasta"
if not os.path.exists(file_sample):
print("WARNING: ", file_sample," not found. We removed it from this analysis.")
dfMultiChecked = dfMultiChecked[dfMultiChecked.demultiplex != thisDemultiplex]
print(dfMultiChecked)
rule all:
input:
expand('01_dereplicated/{demultiplexs}.uniq.fasta', demultiplexs=dfMultiChecked['demultiplex']),
expand('02_goodlength/{demultiplexs}.l.u.fasta', demultiplexs=dfMultiChecked['demultiplex']),
expand('03_clean_pcrerr/{demultiplexs}.r.l.u.fasta', demultiplexs=dfMultiChecked['demultiplex']),
expand('04_filtered/{demultiplexs}.c.r.l.u.fasta', demultiplexs=dfMultiChecked['demultiplex']),
expand('../99_log/04_filter_samples/01_dereplicated/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../99_log/04_filter_samples/02_goodlength/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../99_log/04_filter_samples/03_clean_pcrerr/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../99_log/04_filter_samples/04_filtered/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex'])
include: "../00_rules/filter_samples.smk"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment