Commit c8fde049 authored by peguerin's avatar peguerin
Browse files

snakemake handle CLASSIC format

parent b7a009f5
......@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
......@@ -40,18 +40,42 @@ from Bio.Seq import Seq
## read a sample description .dat file and return a dataframe object
def read_dat(filedat):
    """Read a sample description .dat file and return it as a dataframe.

    The file is parsed as tab-separated text without a header line, then
    the six columns are named 'experiment', 'plaque', 'barcode',
    'primer5', 'primer3' and 'F'.

    Args:
        filedat: path (or file-like object) of the .dat file.

    Returns:
        pandas.DataFrame with one row per line of the .dat file.

    Raises:
        ValueError: if the file does not contain exactly six columns
            (raised by pandas when the column names are assigned).
    """
    dfdat = pandas.read_csv(filedat, sep="\t", header=None)
    ## assigning the names afterwards keeps the strict six-column check
    dfdat.columns = ['experiment', 'plaque', 'barcode', 'primer5', 'primer3', 'F']
    return dfdat
###############################################################################
# GLOBAL VARIABLES
###############################################################################
#configfile: "01_infos/config.yaml"
## check format (CLASSIC or RAPIDRUN)
if config['format'] == "CLASSIC":
    print("CLASSIC data: one single marker for each run")
    ## CLASSIC data come as one .dat file per run: build a table with the
    ## same five columns as a rapidrun file, one row per line of each .dat.
    ## Rows are collected first and the dataframe is built once, because
    ## DataFrame.append is quadratic in a loop and no longer exists in pandas >= 2.0.
    classicRows = []
    for run, thisRunDatfile in config['fichiers']['dat'].items():
        thisDat = read_dat(thisRunDatfile)
        for plaque, experiment in zip(thisDat['plaque'], thisDat['experiment']):
            classicRows.append({
                "plaque": plaque,
                "run": run,
                "sample": plaque,
                "projet": experiment,
                ## the run name is also used as the marker name
                "marker": run,
            })
    dfrClassic = pandas.DataFrame(
        classicRows,
        columns=['plaque', 'run', 'sample', 'projet', 'marker']
    )
    print(dfrClassic)
    ## NOTE(review): the table is written without a header row, while the
    ## file is read back with pandas.read_csv(rapidrunfile, sep=";") and its
    ## default header handling -- confirm the first sample is not consumed
    ## as a header line.
    export_allsample = dfrClassic.to_csv (r'../results/01_settings/all_samples_classic.csv', index = None, header = False, sep = ';')
    rapidrunfile="../results/01_settings/all_samples_classic.csv"
else:
    print("RAPIDRUN data: many markers for many runs")
    ## the rapidrun table is provided directly by the config file
    rapidrunfile = config['fichiers']['rapidrun']
## read 'rapidrun' .tsv file
dfr =pandas.read_csv(rapidrunfile, sep=";")
......
......@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
......@@ -35,6 +35,7 @@ Results are stored into results/02_assembly/
# MODULES
###############################################################################
import pandas
import os.path
###############################################################################
......@@ -44,7 +45,13 @@ import pandas
#configfile: "01_infos/config.yaml"
#rapidrunfile = "../"+config['fichiers']['rapidrun']
## select the sample table: taken from the config file, except for the
## CLASSIC format where the table generated by step 01_settings is used
if config['format'] != "CLASSIC":
    rapidrunfile = config['fichiers']['rapidrun']
else:
    rapidrunfile = "../results/01_settings/all_samples_classic.csv"
    ## fail early with an explicit message instead of a pandas read error
    if not os.path.isfile(rapidrunfile):
        raise FileNotFoundError("ERROR: "+rapidrunfile+" is not a file. You must run step 01_settings first in order to generate this file for the CLASSIC format.")
## read the rapidrun file as dataframe
dfr =pandas.read_csv(rapidrunfile, sep=";")
......
......@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
......@@ -59,8 +59,13 @@ def str_join(df, sep, *cols):
## load demultiplexing dataframe
dfMulti =pandas.read_csv("../results/01_settings/all_demultiplex.csv", sep=",")
## load rapidrun.tsv file
## select the sample table: taken from the config file, except for the
## CLASSIC format where the table generated by step 01_settings is used
if config['format'] != "CLASSIC":
    rapidrunfile = config['fichiers']['rapidrun']
else:
    rapidrunfile = "../results/01_settings/all_samples_classic.csv"
    ## fail early with an explicit message instead of a pandas read error
    if not os.path.isfile(rapidrunfile):
        raise FileNotFoundError("ERROR: "+rapidrunfile+" is not a file. You must run step 01_settings first in order to generate this file for the CLASSIC format.")
## the table is read once, as semicolon-separated values
dfr = pandas.read_csv(rapidrunfile, sep=";")
dfr.columns = ['plaque', 'run','sample','projet','marker']
## remove blacklisted projets
blacklistedProjets = config['blacklist']['projet']
......
......@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
......@@ -71,7 +71,7 @@ rule all:
expand('../logs/04_filter_samples/02_goodlength/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../logs/04_filter_samples/03_clean_pcrerr/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../logs/04_filter_samples/04_filtered/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex'])
include: "rules/dereplicate_samples.smk"
include: "rules/goodlength_samples.smk"
include: "rules/clean_pcrerr_samples.smk"
......
......@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
......@@ -53,10 +53,18 @@ for pmr in projetMarkerRuns.projmarkrun:
file_sample = "../results/05_assignment/01_runs/"+pmr+".fasta"
if not os.path.exists(file_sample):
print("WARNING: ", file_sample," not found. We removed it from this analysis.")
dfpmr = dfpmr[dfpmr.projmarkrun != projmarkrun]
dfpmr = dfpmr[dfpmr.projmarkrun != pmr]
## attribute sample description files to each row with corresponding `marker`
if config['format'] == "CLASSIC":
    ## CLASSIC: every row is assigned with the first marker declared under
    ## assign_taxon/bdr in the config, whatever its own `marker` value is
    thisMarker = str(list(config["assign_taxon"]["bdr"])[0])
    markerDic = {dmarker: thisMarker for dmarker in dfpmr['marker']}
    ## translate the `marker` column once and reuse it for both lookups
    classicMarkers = dfpmr['marker'].map(markerDic)
    dfpmr['bdr'] = classicMarkers.map(config["assign_taxon"]["bdr"])
    dfpmr['fasta'] = classicMarkers.map(config["assign_taxon"]["fasta"])
else:
    ## RAPIDRUN: the `marker` column is looked up directly in the
    ## assign_taxon tables of the config
    dfpmr['bdr'] = dfpmr['marker'].map(config["assign_taxon"]["bdr"])
    dfpmr['fasta'] = dfpmr['marker'].map(config["assign_taxon"]["fasta"])
## display selected `projet`/`marker`/`run` with related information
print(dfpmr)
......
......@@ -35,6 +35,7 @@ CONFIGFILE=$2
#CONFIGFILE="01_infos/config_test.yaml"
#CONFIGFILE="01_infos/config_laperouse_alsace.yaml"
#CONFIGFILE="config/config_tutorial_rapidrun.yaml"
#CONFIGFILE="config/config_tutorial_classic.yaml"
###############################################################################
## write demultiplex table
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment