Commit aedac20f authored by peguerin
Browse files

add fastqc rule

parent 6d0e4250
...@@ -83,6 +83,8 @@ def makeDirectory(path_to_directory): ...@@ -83,6 +83,8 @@ def makeDirectory(path_to_directory):
def mkdir_results(results_subfolders): def mkdir_results(results_subfolders):
if not config['fastqc']:
del results_subfolders['fastq_quality_control']
for k in results_subfolders: for k in results_subfolders:
subfolder=results_subfolders[k] subfolder=results_subfolders[k]
if not os.path.exists(os.path.join("results",subfolder)): if not os.path.exists(os.path.join("results",subfolder)):
...@@ -150,7 +152,7 @@ def str_join(df, sep, *cols): ...@@ -150,7 +152,7 @@ def str_join(df, sep, *cols):
# MAIN # MAIN
############################################################################### ###############################################################################
## generate results subfolders
mkdir_results(results_subfolders) mkdir_results(results_subfolders)
## check format (CLASSIC or RAPIDRUN) ## check format (CLASSIC or RAPIDRUN)
...@@ -170,8 +172,8 @@ if config['format'] == "CLASSIC": ...@@ -170,8 +172,8 @@ if config['format'] == "CLASSIC":
} }
dfrClassic = dfrClassic.append(thisRow, ignore_index=True) dfrClassic = dfrClassic.append(thisRow, ignore_index=True)
print(dfrClassic) print(dfrClassic)
rapidrunfile=results_subfolders['settings']+'/all_samples_classic.csv' rapidrunfile='results/'+results_subfolders['settings']+'/all_samples_classic.csv'
export_allsample = dfrClassic.to_csv (r'./'+rapidrunfile, index = None, header = False, sep = ';') export_allsample = dfrClassic.to_csv (r'{}'.format(rapidrunfile), index = None, header = False, sep = ';')
else: else:
print("RAPIDRUN data: many markers for many runs") print("RAPIDRUN data: many markers for many runs")
#configfile: "01_infos/config.yaml" #configfile: "01_infos/config.yaml"
...@@ -249,7 +251,7 @@ for run in uniqRuns: ...@@ -249,7 +251,7 @@ for run in uniqRuns:
dfMulti = dfMulti.append( thisRow, ignore_index=True) dfMulti = dfMulti.append( thisRow, ignore_index=True)
demultiplexFile='results/'+results_subfolders['settings']+'/all_demultiplex.csv' demultiplexFile='results/'+results_subfolders['settings']+'/all_demultiplex.csv'
export_csv = dfMulti.to_csv (r'./'+demultiplexFile, index = None, header=True,sep=",") export_csv = dfMulti.to_csv (r'{}'.format(demultiplexFile), index = None, header=True,sep=",")
print (dfMulti) print (dfMulti)
...@@ -285,10 +287,10 @@ uniqRuns=dfrm.run.unique() ...@@ -285,10 +287,10 @@ uniqRuns=dfrm.run.unique()
## projmarkrun wildcards ## projmarkrun wildcards
#dfMultiProjMarkRunSample=dfMulti[['projet','marker','run','demultiplex']] #dfMultiProjMarkRunSample=dfMulti[['projet','marker','run','demultiplex']]
dfMultiProjMarkRunSample=dfMulti.loc[:, ('projet','marker','run','demultiplex')] dfMultiProjMarkRunSample = dfMulti.loc[:, ('projet','marker','run','demultiplex')]
dfMultiProjMarkRunSample["demultiplex"]="results/"+results_subfolders['demultiplex_tag']+"/samples/"+dfMultiProjMarkRunSample["demultiplex"]+".fastq" dfMultiProjMarkRunSample["demultiplex"]="results/"+results_subfolders['demultiplex_tag']+"/samples/"+dfMultiProjMarkRunSample["demultiplex"]+".fastq"
#dfMultiProjMarkRun=dfMulti[['projet','marker','run']] #dfMultiProjMarkRun=dfMulti[['projet','marker','run']]
dfMultiProjMarkRun=dfMulti.loc[:, ('projet','marker','run')] dfMultiProjMarkRun = dfMulti.loc[:, ('projet','marker','run')]
dfMultiProjMarkRun['projMarkRun']=str_join(dfMultiProjMarkRun, '/', 'projet', 'marker', 'run') dfMultiProjMarkRun['projMarkRun']=str_join(dfMultiProjMarkRun, '/', 'projet', 'marker', 'run')
dfMultiProjMarkRunSample['projMarkRun']=str_join(dfMultiProjMarkRunSample, '/', 'projet', 'marker', 'run') dfMultiProjMarkRunSample['projMarkRun']=str_join(dfMultiProjMarkRunSample, '/', 'projet', 'marker', 'run')
dfMultiProjMarkRun = dfMultiProjMarkRun.drop_duplicates() dfMultiProjMarkRun = dfMultiProjMarkRun.drop_duplicates()
...@@ -296,18 +298,29 @@ print(dfMultiProjMarkRun) ...@@ -296,18 +298,29 @@ print(dfMultiProjMarkRun)
# projmark wildcards # projmark wildcards
#dfMultiProjMark=dfMulti[['projet','marker']] #dfMultiProjMark=dfMulti[['projet','marker']]
dfMultiProjMark=dfMulti.loc[:, ('projet','marker')] dfMultiProjMark = dfMulti.loc[:, ('projet','marker')]
dfMultiProjMark['projMark']=str_join(dfMultiProjMark, '/', 'projet', 'marker') dfMultiProjMark['projMark'] = str_join(dfMultiProjMark, '/', 'projet', 'marker')
dfMultiProjMark = dfMultiProjMark.drop_duplicates() dfMultiProjMark = dfMultiProjMark.drop_duplicates()
## Final targets requested by "rule all".
## The FastQC quality-control flag is only requested when fastqc is enabled
## in the configuration; the assigned-sequences table flag is always required.
## (Appending conditionally avoids writing the flag path twice and the
## ValueError that list.remove() would raise if the two copies ever drifted.)
ruleAllInputList = []
if config['fastqc']:
    # summary flag touched once every per-run FastQC report exists
    ruleAllInputList.append('results/'+results_subfolders['flags']+'/fastq_quality_control.flag')
# flag produced at the end of the sequence-assignment stage
ruleAllInputList.append('results/'+results_subfolders['flags']+'/table_assigned_sequences.flag')
rule all: rule all:
input: input:
'results/'+results_subfolders['flags']+'/table_assigned_sequences.flag' ruleAllInputList
include: "rules/merge_fastq.smk" if config['fastqc']:
include: "rules/fastq_quality_control.smk"
include: "rules/merge_fastq_after_fastqc.smk"
else:
include: "rules/merge_fastq.smk"
include: "rules/demultiplex_tag.smk" include: "rules/demultiplex_tag.smk"
......
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"


## Run FastQC on one raw fastq file to produce its quality-control report.
## {runR} is a run name plus read orientation, e.g. "run1_R1" — TODO confirm
## against the expand() in checkpoint_fastqc, which builds runR that way.
rule fastq_quality_control:
    input:
        # raw compressed reads: <folder_fastq>/<runR>.fastq.gz
        config["fichiers"]["folder_fastq"]+'{runR}.fastq.gz'
    output:
        # FastQC names its archive after the input file stem: <runR>_fastqc.zip
        'results/'+results_subfolders['fastq_quality_control']+'/{runR}_fastqc.zip'
    params:
        # destination folder handed to fastqc --outdir
        resFolder='results/'+results_subfolders['fastq_quality_control'],
    conda:
        '../envs/env_fastqc.yaml'
    resources:
        job=1
    log:
        'logs/'+results_subfolders['fastq_quality_control']+'/{runR}.log'
    threads:
        1
    shell:
        '''
        fastqc --outdir {params.resFolder} {input} 2> {log}
        '''
## Aggregation barrier: touch a flag file once every per-run FastQC report
## exists. Downstream rules depend on this single flag instead of listing
## all the individual zip archives.
rule checkpoint_fastqc:
    input:
        # one fastqc zip per run and per read orientation (R2 list then R1 list)
        expand('results/'+results_subfolders['fastq_quality_control']+'/{runR}_fastqc.zip', runR=[str(r)+"_R2" for r in uniqRuns]+[str(r)+"_R1" for r in uniqRuns])
    output:
        # empty sentinel file created by snakemake's touch()
        touch('results/'+results_subfolders['flags']+'/fastq_quality_control.flag')
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"


## Merge paired-end reads with vsearch, one merged fastq per run.
## The only declared input is the FastQC flag, which forces quality control
## to finish before merging; the actual R1/R2 fastq paths are supplied
## through params so they do not create extra dependency edges.
rule merge_fastq_after_fastqc:
    input:
        # sentinel produced by checkpoint_fastqc once all FastQC reports exist
        'results/'+results_subfolders['flags']+'/fastq_quality_control.flag'
    output:
        fq='results/'+results_subfolders['merge_fastq']+'/{run}.fastq'
    singularity:
        config["singularity"]["ednatools"]
    conda:
        '../envs/env_vsearch.yaml'
    threads:
        config["merging"]["vsearch"]["cores"]
    resources:
        job=1
    log:
        'logs/'+results_subfolders['merge_fastq']+'/{run}.log'
    params:
        # forward/reverse read files for this run (deliberately not inputs)
        R1=config["fichiers"]["folder_fastq"]+'{run}_R1.fastq.gz',
        R2=config["fichiers"]["folder_fastq"]+'{run}_R2.fastq.gz',
        # fastq quality encoding offset passed to vsearch --fastq_ascii
        encoding=config["merge_fastq"]["encoding"]
    shell:
        '''ls {input};
        vsearch \
        --threads {threads} \
        --fastq_mergepairs {params.R1} \
        --reverse {params.R2} \
        --fastq_ascii {params.encoding} \
        --fastqout {output} \
        --fastq_allowmergestagger \
        --quiet 2> {log}'''
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment