Commit b276542b authored by peguerin's avatar peguerin
Browse files

factoriser la création des dossiers

parent f86120ac
......@@ -31,17 +31,61 @@ wildcards en related information necessary to run the next workflow steps.
###############################################################################
# MODULES
###############################################################################
## Standard library imports
from pathlib import Path
import os
## Third party imports
import pandas
from Bio.Seq import Seq
import os
###############################################################################
# GLOBAL VARIABLES
###############################################################################
# Maps each workflow step name to its numbered results subdirectory under
# "results/". The numeric prefixes (00_..24_) keep the folders listed in
# pipeline order; rules look paths up by key (e.g. results_subfolders['flags'])
# instead of hard-coding the numbered names.
results_subfolders = {
'flags': '00_flags',
'settings': '01_settings',
'merge_fastq': '02_merge_fastq',
'demultiplex_tag': '03_demultiplex_tag',
'trim_primers': '04_trim_primers',
'discard_iupac_ambiguous_sequences': '05_discard_iupac_ambiguous_sequences',
'sample_dereplicate_sequences': '06_sample_dereplicate_sequences',
'flag_projmarkrun': '07_flag_projmarkrun',
'cat_quality': '08_cat_quality',
'filter_quality': '09_filter_quality',
'hash_quality': '10_hash_quality',
'sort_quality': '11_sort_quality',
'flag_projmark': '12_flag_projmark',
'projetmarker_cat_quality': '13_projetmarker_cat_quality',
'projetmarker_cat_fasta': '14_projetmarker_cat_fasta',
'projetmarker_dereplicate_sequences': '15_projetmarker_dereplicate_sequences',
'projetmarker_cluster_sequences': '16_projetmarker_cluster_sequences',
'sort_representatives':'17_sort_representatives',
'remove_chimera_sequences':'18_remove_chimera_sequences',
'otu_table':'19_otu_table',
'otu_fasta':'20_otu_fasta',
'taxonomic_assignment':'21_taxonomic_assignment',
'remove_annotations':'22_remove_annotations',
'sort_abundance_assigned_sequences':'23_sort_abundance_assigned_sequences',
'table_assigned_sequences':'24_table_assigned_sequences'
}
###############################################################################
# FUNCTIONS
###############################################################################
def makeDirectory(path_to_directory):
    """Create *path_to_directory*, including missing parents.

    Does nothing (and raises nothing) when the directory already exists,
    so it is safe to call on every workflow run.
    """
    Path(path_to_directory).mkdir(parents=True, exist_ok=True)
def mkdir_results(results_subfolders):
    """Create one "results/<subfolder>" directory per workflow step.

    results_subfolders -- dict mapping step names to numbered subfolder
    names (e.g. {'flags': '00_flags', ...}); only the values are used.

    Directories that already exist are left untouched (exist_ok=True), so
    re-running the workflow does not crash here.
    """
    for subfolder in results_subfolders.values():
        os.makedirs(os.path.join("results", subfolder), exist_ok=True)
def is_dna_sequence(dna_string, alphabet):
for base in dna_string:
......@@ -67,26 +111,26 @@ def read_dat(filedat):
return dfdat
def write_barcodes_fasta(dfMulti):
if not os.path.exists('results/01_settings/barcodes/'):
os.mkdir('results/01_settings/barcodes/')
def write_barcodes_fasta(dfMulti, results_subfolders):
barcodes_subfolder="results/"+results_subfolders['settings']+"/barcodes/"
makeDirectory(barcodes_subfolder)
runUniq = dfMulti['run'].unique()
for j in range(0,runUniq.size):
thisRunName = runUniq[j]
dfMultiRun = dfMulti[dfMulti['run'] == thisRunName]
#duplicatesBarcodeSamples = dfMultiRun[dfMultiRun.duplicated(['barcode5','barcode3'])]
## list duplicated barcodes 5' 3'
dicDuplicateBarcodes = {'demultiplex': dfMultiRun['demultiplex'], 'linked_barcode': dfMultiRun['barcode5']+"..."+dfMultiRun['barcode3']}
dicDuplicateBarcodes = {'demultiplex': dfMultiRun['demultiplex'], 'linked_barcode': dfMultiRun['barcode5']+'...'+dfMultiRun['barcode3']}
dfDuplicateBarcodes = pandas.DataFrame(data=dicDuplicateBarcodes)
dfDuplicateBarcodesJoin = dfDuplicateBarcodes.groupby(['linked_barcode'], as_index=False).agg(','.join)
dfMultiDuplicate = dfDuplicateBarcodesJoin[dfDuplicateBarcodesJoin['demultiplex'].str.contains(',')]['demultiplex'].str.split(",",expand=True,)
dfMultiDuplicate = "results/03_demultiplex_tag/"+dfMultiDuplicate+".fastq"
dfMultiDuplicate.to_csv (r'results/01_settings/barcodes/'+thisRunName+'_duplicated.csv', index = None, header=None)
dfMultiDuplicate = dfDuplicateBarcodesJoin[dfDuplicateBarcodesJoin['demultiplex'].str.contains(',')]['demultiplex'].str.split(',',expand=True,)
dfMultiDuplicate = 'results/'+results_subfolders['demultiplex_tag']+'/'+dfMultiDuplicate+'.fastq'
dfMultiDuplicate.to_csv (r'./'+barcodes_subfolder+thisRunName+'_duplicated.csv', index = None, header=None)
#dfMultiRun = dfMultiRun[~dfMultiRun['demultiplex'].isin(duplicatesBarcodeSamples['demultiplex'])]
barcodeFilePath='results/01_settings/barcodes/'+str(thisRunName)+'.fasta'
barcodeFilePath=barcodes_subfolder+str(thisRunName)+'.fasta'
barcodeFile = open(barcodeFilePath, 'w')
for index, row in dfMultiRun.iterrows():
SampleFolderPath = 'results/03_demultiplex_tag/'+'/'.join(str(row['demultiplex']).split('/')[:-1])
SampleFolderPath = 'results/'+results_subfolders['demultiplex_tag']+'/'+'/'.join(str(row['demultiplex']).split('/')[:-1])
sampleName = str(row['demultiplex']).split('/')[-1]
fasta_header=">"+ SampleFolderPath+"/"+sampleName
os.makedirs(SampleFolderPath, exist_ok=True)
......@@ -101,36 +145,10 @@ def str_join(df, sep, *cols):
return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), [df[col] for col in cols])
###############################################################################
# MAIN
###############################################################################
# Create every numbered results subdirectory up front, before any rule runs.
# NOTE(review): the stale list-based definition of results_subfolders (the
# removed side of this diff) must not remain here — it would shadow the dict
# defined above and break every results_subfolders['key'] lookup below.
mkdir_results(results_subfolders)
......@@ -151,8 +169,8 @@ if config['format'] == "CLASSIC":
}
dfrClassic = dfrClassic.append(thisRow, ignore_index=True)
print(dfrClassic)
export_allsample = dfrClassic.to_csv (r'results/01_settings/all_samples_classic.csv', index = None, header = False, sep = ';')
rapidrunfile="results/01_settings/all_samples_classic.csv"
rapidrunfile=results_subfolders['settings']+'/all_samples_classic.csv'
export_allsample = dfrClassic.to_csv (r'./'+rapidrunfile, index = None, header = False, sep = ';')
else:
print("RAPIDRUN data: many markers for many runs")
#configfile: "01_infos/config.yaml"
......@@ -229,14 +247,15 @@ for run in uniqRuns:
#print(thisRow)
dfMulti = dfMulti.append( thisRow, ignore_index=True)
export_csv = dfMulti.to_csv (r'results/01_settings/all_demultiplex.csv', index = None, header=True)
demultiplexFile='results/'+results_subfolders['settings']+'/all_demultiplex.csv'
export_csv = dfMulti.to_csv (r'./'+demultiplexFile, index = None, header=True,sep=",")
print (dfMulti)
## write barcodes fasta files for demultiplexing with cutadapt
dfMulti =pandas.read_csv("results/01_settings/all_demultiplex.csv", sep=",")
write_barcodes_fasta(dfMulti)
dfMulti =pandas.read_csv(demultiplexFile, sep=",")
write_barcodes_fasta(dfMulti, results_subfolders)
####################################### step2
......@@ -244,9 +263,9 @@ write_barcodes_fasta(dfMulti)
if config['format'] != "CLASSIC":
rapidrunfile = config['fichiers']['rapidrun']
else:
rapidrunfile="results/01_settings/all_samples_classic.csv"
rapidrunfile='results/'+results_subfolders['settings']+'/all_samples_classic.csv'
if os.path.isfile(rapidrunfile) is not True:
raise Exception("ERROR: "+rapidrunfile+" is not a file. You must run step 01_settings first in order to generate this file for the CLASSIC format.")
raise Exception("ERROR: "+rapidrunfile+" is not a file. You must run the settings rule first in order to generate this file for the CLASSIC format.")
......@@ -266,7 +285,7 @@ uniqRuns=dfrm.run.unique()
## projmarkrun wildcards
#dfMultiProjMarkRunSample=dfMulti[['projet','marker','run','demultiplex']]
dfMultiProjMarkRunSample=dfMulti.loc[:, ('projet','marker','run','demultiplex')]
dfMultiProjMarkRunSample["demultiplex"]="results/"+results_subfolders[2]+"/samples/"+dfMultiProjMarkRunSample["demultiplex"]+".fastq"
dfMultiProjMarkRunSample["demultiplex"]="results/"+results_subfolders['demultiplex_tag']+"/samples/"+dfMultiProjMarkRunSample["demultiplex"]+".fastq"
#dfMultiProjMarkRun=dfMulti[['projet','marker','run']]
dfMultiProjMarkRun=dfMulti.loc[:, ('projet','marker','run')]
dfMultiProjMarkRun['projMarkRun']=str_join(dfMultiProjMarkRun, '/', 'projet', 'marker', 'run')
......@@ -284,27 +303,27 @@ dfMultiProjMark = dfMultiProjMark.drop_duplicates()
rule all:
input:
'results/00_flags/table_assigned_sequences.flag'
'results/'+results_subfolders['flags']+'/table_assigned_sequences.flag'
include: "rules/merge_fastq.smk"
include: "rules/demultiplex_tag.smk"
####################################### step3
rule flag_demultiplex_tag:
input:
expand('results/03_demultiplex_tag/flags/{run}_demultiplex_tag.done', run=uniqRuns)
expand('results/'+results_subfolders['demultiplex_tag']+'/flags/{run}_demultiplex_tag.done', run=uniqRuns)
output:
touch('results/00_flags/demultiplex_tag.flag')
touch('results/'+results_subfolders['flags']+'/demultiplex_tag.flag')
rule checkpoint_demultiplexed_samples_fastq:
input:
'results/00_flags/demultiplex_tag.flag'
'results/'+results_subfolders['flags']+'/demultiplex_tag.flag'
output:
expand('results/03_demultiplex_tag/samples/{demultiplex}.fastq', demultiplex=dfMulti.demultiplex)
run:
expand('results/'+results_subfolders['demultiplex_tag']+'/samples/{demultiplex}.fastq', demultiplex=dfMulti.demultiplex)
run:
for dem in dfMulti.demultiplex:
os.replace("results/03_demultiplex_tag/"+dem+".fastq", "results/03_demultiplex_tag/samples/"+dem+".fastq")
os.replace('results/'+results_subfolders['demultiplex_tag']+'/'+dem+'.fastq', 'results/'+results_subfolders['demultiplex_tag']+'/samples/'+dem+'.fastq')
include: "rules/trim_primers.smk"
include: "rules/discard_iupac_ambiguous_sequences.smk"
......@@ -312,18 +331,18 @@ include: "rules/sample_dereplicate_sequences.smk"
rule flag_sample_dereplicate_sequences:
input:
expand("results/06_sample_dereplicate_sequences/{demultiplex}.fasta", demultiplex=dfMulti.demultiplex)
expand('results/'+results_subfolders['sample_dereplicate_sequences']+'/{demultiplex}.fasta', demultiplex=dfMulti.demultiplex)
output:
touch('results/00_flags/sample_dereplicate_sequences.flag')
touch('results/'+results_subfolders['flags']+'/sample_dereplicate_sequences.flag')
rule flag_projmarkrun:
input:
'results/00_flags/sample_dereplicate_sequences.flag'
'results/'+results_subfolders['flags']+'/sample_dereplicate_sequences.flag'
output:
expand("results/07_flag_projmarkrun/{projmarkrun}.flag", projmarkrun=dfMultiProjMarkRun.projMarkRun)
expand('results/'+results_subfolders['flag_projmarkrun']+'/{projmarkrun}.flag', projmarkrun=dfMultiProjMarkRun.projMarkRun)
run:
for pmr in dfMultiProjMarkRun.projMarkRun:
flagFile = "results/07_flag_projmarkrun/"+pmr+".flag"
flagFile = 'results/'+results_subfolders['flag_projmarkrun']+'/'+pmr+'.flag'
open(flagFile, 'a').close()
include: 'rules/cat_quality.smk'
......@@ -333,19 +352,18 @@ include: 'rules/sort_quality.smk'
rule flag_sort_quality:
input:
expand("results/11_sort_quality/{projmarkrun}.qual", projmarkrun=dfMultiProjMarkRun.projMarkRun)
expand('results/'+results_subfolders['sort_quality']+'/{projmarkrun}.qual', projmarkrun=dfMultiProjMarkRun.projMarkRun)
output:
touch('results/00_flags/sort_quality.flag')
touch('results/'+results_subfolders['flags']+'/sort_quality.flag')
### step 5
rule flag_projmark:
input:
'results/00_flags/sort_quality.flag'
'results/'+results_subfolders['flags']+'/sort_quality.flag'
output:
expand("results/12_flag_projmark/{projmark}/flag", projmark=dfMultiProjMark.projMark)
expand('results/'+results_subfolders['flag_projmark']+'/{projmark}/flag', projmark=dfMultiProjMark.projMark)
run:
for pm in dfMultiProjMark.projMark:
flagFile = "results/12_flag_projmark/"+pm+"/flag"
flagFile = 'results/'+results_subfolders['flag_projmark']+'/'+pm+"/flag"
open(flagFile, 'a').close()
include: "rules/projetmarker_cat_quality.smk"
......@@ -358,7 +376,6 @@ include: "rules/otu_table.smk"
include: "rules/otu_fasta.smk"
## step6
# only one single wildcard {marker} in CLASSIC mode
if config['format'] == "CLASSIC":
thisMarker=str(list(config["reference"])[0])
......@@ -375,18 +392,18 @@ include: 'rules/table_assigned_sequences.smk'
## copy and rename final results
rule flag_table_assigned_sequences:
input:
expand("results/24_table_assigned_sequences/{projmark}/otu.tag.genbank.ann.sort.csv", projmark=dfMultiProjMark.projMark)
expand('results/'+results_subfolders['table_assigned_sequences']+'/{projmark}/otu.tag.genbank.ann.sort.csv', projmark=dfMultiProjMark.projMark)
output:
touch('results/00_flags/table_assigned_sequences.flag')
touch('results/'+results_subfolders['flags']+'/table_assigned_sequences.flag')
run:
print("Final results...", end='')
for pm in dfMultiProjMark.projMark:
thisProjet = pm.split('/')[0]
thisMarker = pm.split('/')[1]
tableNameFile = "results/19_otu_table/"+pm+"/all.otu.csv"
newTableNameFile = "results/"+thisProjet+"_"+thisMarker+"_table_motu.csv"
ecotagNameFile = "results/24_table_assigned_sequences/"+pm+"/otu.tag.genbank.ann.sort.csv"
newEcotagNameFile = "results/"+thisProjet+"_"+thisMarker+"_ecotag_ncbi_motu.csv"
tableNameFile = 'results/'+results_subfolders['otu_table']+'/'+pm+'/all.otu.csv'
newTableNameFile = 'results/'+thisProjet+'_'+thisMarker+'_table_motu.csv'
ecotagNameFile = 'results/'+results_subfolders['table_assigned_sequences']+'/'+pm+'/otu.tag.genbank.ann.sort.csv'
newEcotagNameFile = 'results/'+thisProjet+'_'+thisMarker+'_ecotag_ncbi_motu.csv'
print(thisProjet+"_"+thisMarker+", ", end='')
os.system("cp {0} {1}".format(tableNameFile, newTableNameFile))
os.system("cp {0} {1}".format(ecotagNameFile, newEcotagNameFile))
......
......@@ -4,17 +4,18 @@ __license__ = "MIT"
## concatenate {sample} fastq into {run} fastq
rule cat_quality:
input:
'results/07_flag_projmarkrun/{projmarkrun}.flag'
'results/'+results_subfolders['flag_projmarkrun']+'/{projmarkrun}.flag'
output:
'results/08_cat_quality/{projmarkrun}.fastq'
'results/'+results_subfolders['cat_quality']+'/{projmarkrun}.fastq'
params:
samples= lambda wildcards: " ".join(dfMultiProjMarkRunSample[dfMultiProjMarkRunSample.projMarkRun == wildcards.projmarkrun]['demultiplex'].to_list()),
dmulti= lambda wildcards: dfMultiProjMarkRun[dfMultiProjMarkRun.projMarkRun == wildcards.projmarkrun].to_dict('records')[0]
dmulti= lambda wildcards: dfMultiProjMarkRun[dfMultiProjMarkRun.projMarkRun == wildcards.projmarkrun].to_dict('records')[0],
resFolder='results/'+results_subfolders['cat_quality']
log:
'logs/08_cat_quality/{projmarkrun}.log'
'logs/'+results_subfolders['cat_quality']+'/{projmarkrun}.log'
shell:
'''
mkdir -p results/08_cat_quality/{params.dmulti[projet]}/{params.dmulti[marker]}
mkdir -p {params.resFolder}/{params.dmulti[projet]}/{params.dmulti[marker]}
cat {params.samples} > {output}
echo "cat "{params.samples}" > "{output} > {log}
'''
......
......@@ -5,9 +5,9 @@ __license__ = "MIT"
## we also check for duplicated and undetected barcodes
rule demultiplex_tag:
input:
'results/02_merge_fastq/{run}.fastq'
'results/'+results_subfolders['merge_fastq']+'/{run}.fastq'
output:
'results/03_demultiplex_tag/flags/{run}_demultiplex_tag.done'
'results/'+results_subfolders['demultiplex_tag']+'/flags/{run}_demultiplex_tag.done'
singularity:
config["singularity"]["ednatools"]
conda:
......@@ -19,7 +19,7 @@ rule demultiplex_tag:
dereplicated_csv='results/01_settings/barcodes/{run}_duplicated.csv',
minLen=config["demultiplexing"]["minLen"]
log:
'logs/03_demultiplex_tag/{run}.log'
'logs/'+results_subfolders['demultiplex_tag']+'/{run}.log'
shell:
'''
cutadapt -m {params.minLen} --revcomp -O 8 --discard-untrimmed -g file:{params.barcodes} {input} -o {{name}}.fastq > {log}
......
......@@ -4,9 +4,9 @@ __license__ = "MIT"
## Discard sequences containing Ns, convert to fasta
rule discard_iupac_ambiguous_sequences:
input:
'results/04_trim_primers/{demultiplex}.fastq'
'results/'+results_subfolders['trim_primers']+'/{demultiplex}.fastq'
output:
'results/05_discard_iupac_ambiguous_sequences/{demultiplex}.fasta'
'results/'+results_subfolders['discard_iupac_ambiguous_sequences']+'/{demultiplex}.fasta'
singularity:
config["singularity"]["ednatools"]
conda:
......@@ -16,12 +16,13 @@ rule discard_iupac_ambiguous_sequences:
resources:
job=1
params:
dmulti= lambda wildcards: dfMulti[dfMulti.demultiplex == wildcards.demultiplex].to_dict('records')[0]
dmulti= lambda wildcards: dfMulti[dfMulti.demultiplex == wildcards.demultiplex].to_dict('records')[0],
resFolder='results/'+results_subfolders['discard_iupac_ambiguous_sequences']
log:
'logs/05_discard_iupac_ambiguous_sequences/{demultiplex}.log'
'logs/'+results_subfolders['discard_iupac_ambiguous_sequences']+'/{demultiplex}.log'
shell:
'''
if [[ -s {input} ]]; then mkdir -p results/05_discard_iupac_ambiguous_sequences/{params.dmulti[projet]}/{params.dmulti[marker]};
if [[ -s {input} ]]; then mkdir -p {params.resFolder}/{params.dmulti[projet]}/{params.dmulti[marker]};
vsearch --fastq_filter {input} --fastq_maxns 0 --fastaout {output} 2> {log};
else touch {output}; echo "the input file {input} is empty." > {log};
fi
......
......@@ -4,11 +4,12 @@ __license__ = "MIT"
## Discard sequences containing Ns, keep to fastq format
rule filter_quality:
input:
'results/08_cat_quality/{projmarkrun}.fastq'
'results/'+results_subfolders['cat_quality']+'/{projmarkrun}.fastq'
output:
'results/09_filter_quality/{projmarkrun}.fastq'
'results/'+results_subfolders['filter_quality']+'/{projmarkrun}.fastq'
params:
dmulti= lambda wildcards: dfMultiProjMarkRun[dfMultiProjMarkRun.projMarkRun == wildcards.projmarkrun].to_dict('records')[0],
resFolder='results/'+results_subfolders['filter_quality']
singularity:
config["singularity"]["ednatools"]
conda:
......@@ -16,9 +17,9 @@ rule filter_quality:
threads:
1
log:
'logs/09_filter_quality/{projmarkrun}.log'
'logs/'+results_subfolders['filter_quality']+'/{projmarkrun}.log'
shell:
'''
mkdir -p results/09_filter_quality/{params.dmulti[projet]}/{params.dmulti[marker]}
mkdir -p {params.resFolder}/{params.dmulti[projet]}/{params.dmulti[marker]}
vsearch --fastq_filter {input} --threads {threads} --fastq_maxns 0 --relabel_sha1 --eeout --fastqout {output} 2> {log}
'''
\ No newline at end of file
......@@ -4,17 +4,18 @@ __license__ = "MIT"
## Discard quality lines, extract hash, expected error rates and read length
rule hash_quality:
input:
'results/09_filter_quality/{projmarkrun}.fastq'
'results/'+results_subfolders['filter_quality']+'/{projmarkrun}.fastq'
output:
'results/10_hash_quality/{projmarkrun}.tmp'
'results/'+results_subfolders['hash_quality']+'/{projmarkrun}.tmp'
params:
dmulti= lambda wildcards: dfMultiProjMarkRun[dfMultiProjMarkRun.projMarkRun == wildcards.projmarkrun].to_dict('records')[0],
resFolder='results/'+results_subfolders['hash_quality']
threads:
1
log:
'logs/10_hash_quality/{projmarkrun}.log'
'logs/'+results_subfolders['hash_quality']+'/{projmarkrun}.log'
shell:
'''
mkdir -p results/10_hash_quality/{params.dmulti[projet]}/{params.dmulti[marker]}
mkdir -p {params.resFolder}/{params.dmulti[projet]}/{params.dmulti[marker]}
sed 'n;n;N;d' {input} | awk -f scripts/quality.awk | tr -d "@" > {output} 2> {log};
'''
\ No newline at end of file
......@@ -8,7 +8,7 @@ rule merge_fastq:
R1=config["fichiers"]["folder_fastq"]+'{run}_R1.fastq.gz',
R2=config["fichiers"]["folder_fastq"]+'{run}_R2.fastq.gz'
output:
fq='results/02_merge_fastq/{run}.fastq'
fq='results/'+results_subfolders['merge_fastq']+'/{run}.fastq'
singularity:
config["singularity"]["ednatools"]
conda:
......@@ -18,7 +18,7 @@ rule merge_fastq:
resources:
job=1
log:
'logs/02_merge_fastq/{run}.log'
'logs/'+results_subfolders['merge_fastq']+'/{run}.log'
params:
encoding=config["merge_fastq"]["encoding"]
shell:
......
......@@ -4,11 +4,15 @@ __license__ = "MIT"
## OTU fasta
rule otu_fasta:
input:
'results/19_otu_table/{projmark}/all.otu.csv'
'results/'+results_subfolders['otu_table']+'/{projmark}/all.otu.csv'
output:
'results/20_otu_fasta/{projmark}/all.otu.fasta'
'results/'+results_subfolders['otu_fasta']+'/{projmark}/all.otu.fasta'
params:
resFolder='results/'+results_subfolders['otu_fasta']
log:
'logs/'+results_subfolders['otu_fasta']+'/{projmark}/otu_to_fasta.log'
shell:
'''
mkdir -p results/20_otu_fasta/{wildcards.projmark}
python3 scripts/OTU_table2fasta.py {input} {output}
mkdir -p {params.resFolder}/{wildcards.projmark}
python3 scripts/OTU_table2fasta.py {input} {output} 2> {log}
'''
......@@ -4,22 +4,23 @@ __license__ = "MIT"
## OTU table
rule otu_table:
input:
'results/18_remove_chimera_sequences/{projmark}/representatives.uchime'
'results/'+results_subfolders['remove_chimera_sequences']+'/{projmark}/representatives.uchime'
output:
'results/19_otu_table/{projmark}/all.otu.csv'
'results/'+results_subfolders['otu_table']+'/{projmark}/all.otu.csv'
params:
representatives='results/17_sort_representatives/{projmark}/representatives.sorted.fasta',
stats='results/16_projetmarker_cluster_sequences/{projmark}/all.stats',
swarm='results/16_projetmarker_cluster_sequences/{projmark}/all.swarm',
qual='results/13_projetmarker_cat_quality/{projmark}/all.qual',
assignment='results/18_remove_chimera_sequences/{projmark}/representatives.assignment'
representatives='results/'+results_subfolders['sort_representatives']+'/{projmark}/representatives.sorted.fasta',
stats='results/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/all.stats',
swarm='results/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/all.swarm',
qual='results/'+results_subfolders['projetmarker_cat_quality']+'/{projmark}/all.qual',
assignment='results/'+results_subfolders['remove_chimera_sequences']+'/{projmark}/representatives.assignment',
resFolder='results/'+results_subfolders['otu_table']
log:
'logs/19_otu_table/{projmark}/otu_contingency_table.log'
'logs/'+results_subfolders['otu_table']+'/{projmark}/otu_contingency_table.log'
threads:
1
shell:
'''
mkdir -p results/19_otu_table/{wildcards.projmark}
mkdir -p {params.resFolder}/{wildcards.projmark}
python3 scripts/OTU_contingency_table.py \
{params.representatives} \
{params.stats} \
......
......@@ -4,14 +4,16 @@ __license__ = "MIT"
## Pool sequences and quality files
rule projetmarker_cat_fasta:
input:
'results/13_projetmarker_cat_quality/{projmark}/all.qual'
'results/'+results_subfolders['projetmarker_cat_quality']+'/{projmark}/all.qual'
output:
'results/14_projetmarker_cat_fasta/{projmark}/all.fasta'
'results/'+results_subfolders['projetmarker_cat_fasta']+'/{projmark}/all.fasta'
threads:
1
params:
resFolder='results/'+results_subfolders['projetmarker_cat_fasta']
shell:
'''
mkdir -p results/14_projetmarker_cat_fasta/{wildcards.projmark}
mkdir -p {params.resFolder}/{wildcards.projmark}
cat results/06_sample_dereplicate_sequences/{wildcards.projmark}/*.fasta > {output}
'''
......@@ -4,14 +4,16 @@ __license__ = "MIT"
## Pool sequences and quality files
rule projetmarker_cat_quality:
input:
'results/12_flag_projmark/{projmark}/flag'
'results/'+results_subfolders['flag_projmark']+'/{projmark}/flag'
output:
'results/13_projetmarker_cat_quality/{projmark}/all.qual'
'results/'+results_subfolders['projetmarker_cat_quality']+'/{projmark}/all.qual'
params:
resFolder='results/'+results_subfolders['projetmarker_cat_quality']
threads:
1
shell:
'''
mkdir -p results/13_projetmarker_cat_quality/{wildcards.projmark}
mkdir -p {params.resFolder}/{wildcards.projmark}
cat results/11_sort_quality/{wildcards.projmark}/*.qual > {output}
'''
......@@ -4,9 +4,9 @@ __license__ = "MIT"
## Clustering
rule projetmarker_cluster_sequences:
input:
'results/15_projetmarker_dereplicate_sequences/{projmark}/all.uniq.fasta'
'results/'+results_subfolders['projetmarker_dereplicate_sequences']+'/{projmark}/all.uniq.fasta'
output:
'results/16_projetmarker_cluster_sequences/{projmark}/representatives.tmp'
'results/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/representatives.tmp'
threads:
config["clustering"]["swarm"]["cores"]
singularity:
......@@ -14,13 +14,14 @@ rule projetmarker_cluster_sequences:
conda:
'../envs/env_swarm2.yaml'
params:
struct='results/16_projetmarker_cluster_sequences/{projmark}/all.struct',
stats='results/16_projetmarker_cluster_sequences/{projmark}/all.stats',
swarm='results/16_projetmarker_cluster_sequences/{projmark}/all.swarm'
struct='results/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/all.struct',
stats='results/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/all.stats',
swarm='results/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/all.swarm',
resFolder='results/'+results_subfolders['projetmarker_cluster_sequences']
log:
'logs/16_projetmarker_cluster_sequences/{projmark}/swarm.log'
'logs/'+results_subfolders['projetmarker_cluster_sequences']+'/{projmark}/swarm.log'
shell:
'''
mkdir -p results/16_projetmarker_cluster_sequences/{wildcards.projmark}
mkdir -p {params.resFolder}/{wildcards.projmark}
swarm -d 1 -f -t {threads} -z -i {params.struct} -s {params.stats} -w {output} -o {params.swarm} < {input} 2> {log}
'''
\ No newline at end of file
......@@ -4,20 +4,22 @@ __license__ = "MIT"
## Dereplicate and fix name of sequences (have to end with ';')
rule projetmarker_dereplicate_sequences:
input:
'results/14_projetmarker_cat_fasta/{projmark}/all.fasta'
'results/'+results_subfolders['projetmarker_cat_fasta']+'/{projmark}/all.fasta'
output:
'results/15_projetmarker_dereplicate_sequences/{projmark}/all.uniq.fasta'
'results/'+results_subfolders['projetmarker_dereplicate_sequences']+'/{projmark}/all.uniq.fasta'
threads:
1
log:
'logs/15_projetmarker_dereplicate_sequences/{projmark}/vsearch.log'
'logs/'+results_subfolders['projetmarker_dereplicate_sequences']+'/{projmark}/vsearch.log'
singularity:
config["singularity"]["ednatools"]
conda:
'../envs/env_vsearch.yaml'
params:
resFolder='results/'+results_subfolders['projetmarker_dereplicate_sequences']
shell:
'''
mkdir -p results/15_projetmarker_dereplicate_sequences/{wildcards.projmark}
mkdir -p {params.resFolder}/{wildcards.projmark}
vsearch --derep_fulllength {input} --sizein --sizeout --fasta_width 0 --output {output} > /dev/null 2> {log}
sed -i '1~2 s/$/;/g' {output}
'''
\ No newline at end of file
......@@ -2,22 +2,23 @@ __author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
## Some unuseful attributes can be removed at this stage
rule annotate_assign:
rule remove_annotations:
input:
'results/21_taxonomic_assignment/{projmark}/otu.tag.genbank.fasta'
'results/'+results_subfolders['taxonomic_assignment']+'/{projmark}/otu.tag.genbank.fasta'
output:
'results/22_remove_annotations/{projmark}/otu.tag.genbank.ann.fasta'
'results/'+results_subfolders['remove_annotations']+'/{projmark}/otu.tag.genbank.ann.fasta'
params:
dmulti= lambda wildcards: dfMultiProjMark[dfMultiProjMark.projMark == wildcards.projmark].to_dict('records')[0],
resFolder='results/'+results_subfolders['remove_annotations']
log:
'logs/22_remove_annotations/{projmark}/obiannotate.log'
'logs/'+results_subfolders['remove_annotations']+'/{projmark}/obiannotate.log'
singularity:
config["singularity"]["obitools"]
conda:
'../envs/env_obitools.yaml'
shell:
'''
mkdir -p results/22_remove_annotations/{wildcards.projmark}
mkdir -p {params.resFolder}/{wildcards.projmark}
obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
--delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
--delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
......
......@@ -4,11 +4,12 @@ __license__ = "MIT"
## Chimera checking
rule remove_chimera_sequences:
input:
'results/17_sort_representatives/{projmark}/representatives.sorted.fasta'
'results/'+results_subfolders['sort_representatives']+'/{projmark}/representatives.sorted.fasta'
output:
'results/18_remove_chimera_sequences/{projmark}/representatives.uchime'
'results/'+results_subfolders['remove_chimera_sequences']+'/{projmark}/representatives.uchime'
params:
assignment='results/18_remove_chimera_sequences/{projmark}/representatives.assignment'