Commit aedac20f authored by peguerin's avatar peguerin
Browse files

add fastqc rule

parent 6d0e4250
......@@ -83,6 +83,8 @@ def makeDirectory(path_to_directory):
def mkdir_results(results_subfolders):
if not config['fastqc']:
del results_subfolders['fastq_quality_control']
for k in results_subfolders:
subfolder=results_subfolders[k]
if not os.path.exists(os.path.join("results",subfolder)):
......@@ -150,7 +152,7 @@ def str_join(df, sep, *cols):
# MAIN
###############################################################################
## generate results subfolders
mkdir_results(results_subfolders)
## check format (CLASSIC or RAPIDRUN)
......@@ -170,8 +172,8 @@ if config['format'] == "CLASSIC":
}
dfrClassic = dfrClassic.append(thisRow, ignore_index=True)
print(dfrClassic)
rapidrunfile=results_subfolders['settings']+'/all_samples_classic.csv'
export_allsample = dfrClassic.to_csv (r'./'+rapidrunfile, index = None, header = False, sep = ';')
rapidrunfile='results/'+results_subfolders['settings']+'/all_samples_classic.csv'
export_allsample = dfrClassic.to_csv (r'{}'.format(rapidrunfile), index = None, header = False, sep = ';')
else:
print("RAPIDRUN data: many markers for many runs")
#configfile: "01_infos/config.yaml"
......@@ -249,7 +251,7 @@ for run in uniqRuns:
dfMulti = dfMulti.append( thisRow, ignore_index=True)
demultiplexFile='results/'+results_subfolders['settings']+'/all_demultiplex.csv'
export_csv = dfMulti.to_csv (r'./'+demultiplexFile, index = None, header=True,sep=",")
export_csv = dfMulti.to_csv (r'{}'.format(demultiplexFile), index = None, header=True,sep=",")
print (dfMulti)
......@@ -285,10 +287,10 @@ uniqRuns=dfrm.run.unique()
## projmarkrun wildcards
#dfMultiProjMarkRunSample=dfMulti[['projet','marker','run','demultiplex']]
dfMultiProjMarkRunSample=dfMulti.loc[:, ('projet','marker','run','demultiplex')]
dfMultiProjMarkRunSample = dfMulti.loc[:, ('projet','marker','run','demultiplex')]
dfMultiProjMarkRunSample["demultiplex"]="results/"+results_subfolders['demultiplex_tag']+"/samples/"+dfMultiProjMarkRunSample["demultiplex"]+".fastq"
#dfMultiProjMarkRun=dfMulti[['projet','marker','run']]
dfMultiProjMarkRun=dfMulti.loc[:, ('projet','marker','run')]
dfMultiProjMarkRun = dfMulti.loc[:, ('projet','marker','run')]
dfMultiProjMarkRun['projMarkRun']=str_join(dfMultiProjMarkRun, '/', 'projet', 'marker', 'run')
dfMultiProjMarkRunSample['projMarkRun']=str_join(dfMultiProjMarkRunSample, '/', 'projet', 'marker', 'run')
dfMultiProjMarkRun = dfMultiProjMarkRun.drop_duplicates()
......@@ -296,18 +298,29 @@ print(dfMultiProjMarkRun)
# projmark wildcards
#dfMultiProjMark=dfMulti[['projet','marker']]
dfMultiProjMark=dfMulti.loc[:, ('projet','marker')]
dfMultiProjMark['projMark']=str_join(dfMultiProjMark, '/', 'projet', 'marker')
dfMultiProjMark = dfMulti.loc[:, ('projet','marker')]
dfMultiProjMark['projMark'] = str_join(dfMultiProjMark, '/', 'projet', 'marker')
dfMultiProjMark = dfMultiProjMark.drop_duplicates()
rule all:
input:
ruleAllInputList = [
'results/'+results_subfolders['flags']+'/fastq_quality_control.flag',
'results/'+results_subfolders['flags']+'/table_assigned_sequences.flag'
]
if not config['fastqc']:
ruleAllInputList.remove('results/'+results_subfolders['flags']+'/fastq_quality_control.flag')
include: "rules/merge_fastq.smk"
rule all:
input:
ruleAllInputList
if config['fastqc']:
include: "rules/fastq_quality_control.smk"
include: "rules/merge_fastq_after_fastqc.smk"
else:
include: "rules/merge_fastq.smk"
include: "rules/demultiplex_tag.smk"
......
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
## Run FastQC on one gzipped FASTQ file, identified by the {runR} wildcard.
## The rule declares the `{runR}_fastqc.zip` archive as its only output;
## stderr of the fastqc call is redirected to the rule's log file.
rule fastq_quality_control:
    input:
        config["fichiers"]["folder_fastq"] + "{runR}.fastq.gz"
    output:
        "results/" + results_subfolders["fastq_quality_control"] + "/{runR}_fastqc.zip"
    log:
        "logs/" + results_subfolders["fastq_quality_control"] + "/{runR}.log"
    conda:
        "../envs/env_fastqc.yaml"
    threads: 1
    resources:
        job=1
    params:
        # Directory passed to `fastqc --outdir`; it is the folder of the declared output.
        resFolder="results/" + results_subfolders["fastq_quality_control"]
    shell:
        """
        fastqc --outdir {params.resFolder} {input} 2> {log}
        """
## Gate rule: requires the FastQC archive of every run in `uniqRuns`
## (all `<run>_R2` files first, then all `<run>_R1` files) and records
## completion by touching a flag file.
## NOTE: despite its name this is a regular `rule`, not a Snakemake `checkpoint`.
rule checkpoint_fastqc:
    input:
        expand(
            "results/" + results_subfolders["fastq_quality_control"] + "/{runR}_fastqc.zip",
            runR=[str(r) + read for read in ("_R2", "_R1") for r in uniqRuns]
        )
    output:
        touch("results/" + results_subfolders["flags"] + "/fastq_quality_control.flag")
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
## Merge the paired-end reads (R1/R2) of one run with vsearch.
## The FastQC flag file is listed as an input only to force the quality-control
## step to finish before merging; the merge command itself never reads it.
rule merge_fastq_after_fastqc:
    input:
        flag='results/'+results_subfolders['flags']+'/fastq_quality_control.flag',
        # The raw reads are declared as inputs (they used to be params) so that
        # Snakemake checks they exist before running and re-runs the merge
        # when they change.
        R1=config["fichiers"]["folder_fastq"]+'{run}_R1.fastq.gz',
        R2=config["fichiers"]["folder_fastq"]+'{run}_R2.fastq.gz'
    output:
        fq='results/'+results_subfolders['merge_fastq']+'/{run}.fastq'
    singularity:
        config["singularity"]["ednatools"]
    conda:
        '../envs/env_vsearch.yaml'
    # NOTE(review): threads are read from config["merging"] while the encoding
    # is read from config["merge_fastq"] -- confirm both sections exist in the
    # configuration file.
    threads:
        config["merging"]["vsearch"]["cores"]
    resources:
        job=1
    log:
        'logs/'+results_subfolders['merge_fastq']+'/{run}.log'
    params:
        encoding=config["merge_fastq"]["encoding"]
    shell:
        '''
        vsearch \
        --threads {threads} \
        --fastq_mergepairs {input.R1} \
        --reverse {input.R2} \
        --fastq_ascii {params.encoding} \
        --fastqout {output.fq} \
        --fastq_allowmergestagger \
        --quiet 2> {log}
        '''
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment