Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
edna
snakemake_rapidrun_obitools
Commits
c8fde049
Commit c8fde049 authored Sep 16, 2020 by peguerin
Browse files
snakemake handle CLASSIC format
parent
b7a009f5
Changes
6
Hide whitespace changes
Inline
Side-by-side
01_settings/readwrite_rapidrun_demultiplexing.py
View file @
c8fde049
...
...
@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
...
...
@@ -40,18 +40,42 @@ from Bio.Seq import Seq
## read a sample description .dat file and return a dataframe object
def read_dat(filedat):
    dfdat = pandas.read_csv(filedat, sep="\t", header=None)
    dfdat.columns = ['plaque', 'plaque1', 'barcode', 'primer5', 'primer3', 'F']
    dfdat = pandas.read_csv(filedat, sep="\t", header=None)
    dfdat.columns = ['experiment', 'plaque', 'barcode', 'primer5', 'primer3', 'F']
    return dfdat
###############################################################################
# GLOBAL VARIABLES
###############################################################################
#configfile: "01_infos/config.yaml"
## check format (CLASSIC or RAPIDRUN)
if config['format'] == "CLASSIC":
    print("CLASSIC data: one single marker for each run")
    dfrClassic = pandas.DataFrame(columns=['plaque', 'run', 'sample', 'projet', 'marker'])
    for run in config['fichiers']['dat']:
        thisRunDatfile = config['fichiers']['dat'][run]
        thisDat = read_dat(thisRunDatfile)
        for index, datRow in thisDat.iterrows():
            thisRow = {"plaque": datRow['plaque'], "run": run, "sample": datRow['plaque'], "projet": datRow['experiment'], "marker": run}
            dfrClassic = dfrClassic.append(thisRow, ignore_index=True)
    print(dfrClassic)
    export_allsample = dfrClassic.to_csv(r'../results/01_settings/all_samples_classic.csv', index=None, header=False, sep=';')
    rapidrunfile = "../results/01_settings/all_samples_classic.csv"
else:
    print("RAPIDRUN data: many markers for many runs")
    #configfile: "01_infos/config.yaml"
    rapidrunfile = config['fichiers']['rapidrun']
    #rapidrunfile="01_infos/all_samples.tsv"
rapidrunfile = config['fichiers']['rapidrun']
#rapidrunfile="01_infos/all_samples.tsv"
## read 'rapidrun' .tsv file
dfr = pandas.read_csv(rapidrunfile, sep=";")
...
...
02_assembly/Snakefile
View file @
c8fde049
...
...
@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
...
...
@@ -35,6 +35,7 @@ Results are stored into results/02_assembly/
# MODULES
###############################################################################
import pandas
import os.path
###############################################################################
...
...
@@ -44,7 +45,13 @@ import pandas
#configfile: "01_infos/config.yaml"
#rapidrunfile = "../"+config['fichiers']['rapidrun']
rapidrunfile = config['fichiers']['rapidrun']
if config['format'] != "CLASSIC":
rapidrunfile = config['fichiers']['rapidrun']
else:
rapidrunfile="../results/01_settings/all_samples_classic.csv"
if os.path.isfile(rapidrunfile) is not True:
raise Exception("ERROR: "+rapidrunfile+" is not a file. You must run step 01_settings first in order to generate this file for the CLASSIC format.")
## read the rapidrun file as dataframe
dfr =pandas.read_csv(rapidrunfile, sep=";")
...
...
03_demultiplex/Snakefile
View file @
c8fde049
...
...
@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
...
...
@@ -59,8 +59,13 @@ def str_join(df, sep, *cols):
## load demultiplexing dataframe
dfMulti =pandas.read_csv("../results/01_settings/all_demultiplex.csv", sep=",")
## load rapidrun.tsv file
rapidrunfile = config['fichiers']['rapidrun']
dfr =pandas.read_csv(rapidrunfile, sep="\t")
if config['format'] != "CLASSIC":
rapidrunfile = config['fichiers']['rapidrun']
else:
rapidrunfile="../results/01_settings/all_samples_classic.csv"
if os.path.isfile(rapidrunfile) is not True:
raise Exception("ERROR: "+rapidrunfile+" is not a file. You must run step 01_settings first in order to generate this file for the CLASSIC format.")
dfr =pandas.read_csv(rapidrunfile, sep=";")
dfr.columns = ['plaque', 'run','sample','projet','marker']
## remove blacklisted projets
blacklistedProjets = config['blacklist']['projet']
...
...
04_filter_samples/Snakefile
View file @
c8fde049
...
...
@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
...
...
@@ -71,7 +71,7 @@ rule all:
expand('../logs/04_filter_samples/02_goodlength/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../logs/04_filter_samples/03_clean_pcrerr/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex']),
expand('../logs/04_filter_samples/04_filtered/{demultiplexs}.log', demultiplexs=dfMultiChecked['demultiplex'])
include: "rules/dereplicate_samples.smk"
include: "rules/goodlength_samples.smk"
include: "rules/clean_pcrerr_samples.smk"
...
...
05_assignment/Snakefile
View file @
c8fde049
...
...
@@ -4,7 +4,7 @@
__author__ = "Pierre-Edouard Guerin"
__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.4"
__maintainer__ = "Pierre-Edouard Guerin"
__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
__status__ = "Production"
...
...
@@ -53,10 +53,18 @@ for pmr in projetMarkerRuns.projmarkrun:
file_sample = "../results/05_assignment/01_runs/"+pmr+".fasta"
if not os.path.exists(file_sample):
print("WARNING: ", file_sample," not found. We removed it from this analysis.")
dfpmr = dfpmr[dfpmr.projmarkrun != projmarkrun]
dfpmr = dfpmr[dfpmr.projmarkrun != pmr]
## attribute sample description files to each row with corresponding `marker`
dfpmr['bdr'] = dfpmr['marker'].map(config["assign_taxon"]["bdr"])
dfpmr['fasta'] = dfpmr['marker'].map(config["assign_taxon"]["fasta"])
if config['format'] == "CLASSIC":
thisMarker=str(list(config["assign_taxon"]["bdr"])[0])
markerDic = {}
for dmarker in dfpmr['marker']:
markerDic[dmarker] = thisMarker
dfpmr['bdr'] = dfpmr['marker'].map(markerDic).map(config["assign_taxon"]["bdr"])
dfpmr['fasta'] = dfpmr['marker'].map(markerDic).map(config["assign_taxon"]["fasta"])
else:
dfpmr['bdr'] = dfpmr['marker'].map(config["assign_taxon"]["bdr"])
dfpmr['fasta'] = dfpmr['marker'].map(config["assign_taxon"]["fasta"])
## display selected `projet`/`marker`/`run` with related information
print(dfpmr)
...
...
main.sh
View file @
c8fde049
...
...
@@ -35,6 +35,7 @@ CONFIGFILE=$2
#CONFIGFILE="01_infos/config_test.yaml"
#CONFIGFILE="01_infos/config_laperouse_alsace.yaml"
#CONFIGFILE="config/config_tutorial_rapidrun.yaml"
#CONFIGFILE="config/config_tutorial_classic.yaml"
###############################################################################
## write demultiplex table
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment