Commit e4c5544b authored by peguerin's avatar peguerin
Browse files

reinit

parent 38a72523
......@@ -40,5 +40,20 @@ cat $NO_SEQ_BOLD $GEONAMES_ASSIGNED_REG_BOLD > $ALL_FORMAT_BOLD
## keep the rows whose column 70 is "COI-5P", plus the row whose column 70 reads "markercode" (presumably the header)
COI_BOLD=${ALL_FORMAT_BOLD/.tsv/_coi.tsv}
awk -F "\t" '$70 == "COI-5P" || $70 == "markercode"' $ALL_FORMAT_BOLD > $COI_BOLD
## keep the first row and every row whose column 72 is longer than 499 characters
COI_LENGHT_BOLD=${COI_BOLD/.tsv/_length499.tsv}
awk -F "\t" 'NR == 1 || length($72) > 499' $COI_BOLD > $COI_LENGHT_BOLD
\ No newline at end of file
## keep sequence with a length greater than 420 bases
## (the first row is always kept)
COI_LENGHT_BOLD=${COI_BOLD/.tsv/_length420.tsv}
awk -F "\t" 'NR == 1 || length($72) > 420' "$COI_BOLD" > "$COI_LENGHT_BOLD"
## keep species with at least 2 sequences
## A species is identified by the complete content of column 22, compared as an
## exact string: names made of one word or of more than two words are neither
## truncated nor interpreted as regular expressions, and text found in other
## columns cannot exclude a row.
SPECIES_WITH_A_SINGLE_SEQUENCE="02-bold_format/species_with_a_single_sequence.txt"
COI_2SEQ_SPECIES=${COI_LENGHT_BOLD/.tsv/_atleast2seqbyspecies.tsv}
## list (sorted) the column-22 values found on exactly one row; the first row is not counted
awk -F "\t" 'NR > 1 { n[$22]++ } END { for (s in n) if (n[s] < 2) print s }' "$COI_LENGHT_BOLD" | sort > "$SPECIES_WITH_A_SINGLE_SEQUENCE"
## the file is read twice: the first pass counts rows per column-22 value,
## the second pass prints the first row and the rows whose value was counted at least twice
awk -F "\t" 'NR == FNR { if (FNR > 1) n[$22]++; next } FNR == 1 || n[$22] >= 2' "$COI_LENGHT_BOLD" "$COI_LENGHT_BOLD" > "$COI_2SEQ_SPECIES"
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
......@@ -3,17 +3,58 @@
Here we go again to investigate genetic patterns at different scales
# Prerequisites
# Installation
* python2
## Prerequisites
* python3
* bash
* lynx
* awk
### CONDA
Instructions to install CONDA are [here](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html)
### JUPYTER NOTEBOOK
* install jupyter notebook using CONDA
```
conda install -c conda-forge notebook
```
### R
```
conda env create -f conda/env_ubuntu.yml
conda activate mapmarine2
```
If you haven’t done this already, you will have to make Jupyter see the newly installed R kernel by installing a kernel spec. The kernel spec can be installed for the current user with the following line from R:
```
IRkernel::installspec()
```
### geogendiv
In an R session:
```
devtools::install_github("grelot/geogendivr", force = TRUE)
```
# 1 Download and format BOLD database
1. I type the word 'actinopterygii' on [BOLD website](http://www.boldsystems.org).
1. I seek the term 'actinopterygii' on [BOLD website](http://www.boldsystems.org).
2. I download Combined TSV files on **Monday the 6th of January 2020**.
3. The original file is stored into [01-bold_raw](01-bold_raw) folder.
......@@ -26,9 +67,8 @@ Here we go again to investigate genetic patterns at different scale
4. File with COI georeferenced sequences is stored as [02-bold_format/all_format_bold_coi.tsv](02-bold_format/all_format_bold_coi.tsv)
```
bash 00-scripts/step1/format_bold.sh
bash 00-scripts/step2/format_bold.sh
```
# Cluster individual sequences by region/tile
# 3
# Conda environment definition: environment name, channels, and dependencies.
name: mapmarine2
# Channels listed for this environment.
channels:
- r
- conda-forge
- conda-forge/label/gcc7
- bioconda
# Every dependency below is pinned to a specific version with `=`.
dependencies:
- jupyter=1.0.0
- r-taxize=0.9.91
- r-irkernel=0.8.12
- r-rgdal=1.4.4
- r-rgeos=0.5.1
- r-dplyr=0.8.3
- r-leaflet=2.0.3
- r-htmlwidgets=1.3
- r-htmltools=0.3.6
- r-sf=0.8
- r-classint=0.4.2
- r-tidyverse=1.2.1
- r-xfun=0.5
- r-biocmanager=1.30.4
- r-bold=0.9.0
- bioconductor-sangerseqr=1.22.0
# Conda environment definition: environment name, channels, and dependencies.
name: mapmarine2
# Channels listed for this environment.
channels:
- conda-forge
- conda-forge/label/gcc7
- bioconda
- bioconda/label/gcc7
- r
# Dependencies: most entries carry a version constraint (`=x.y.z` or a `*`
# wildcard); entries written as a bare name carry none.
dependencies:
- libblas
- r=3.6.*
- r-rcpp=1.0.1
- r-geonames=0.999
- r-data.table=1.12.2
- r-stringi=1.4.5
- r-mclust=5.4.5
- bioconductor-preprocesscore=1.48.0
# NOTE(review): this entry separates name and version with a space, while the
# other pinned entries use `=` — confirm this is intended.
- r-classint 0.4.1
- r-taxize=0.9.*
- r-rgdal=1.4.4
- r-rgeos=0.5.1
- r-dplyr=0.8.3
- r-sf=0.8
- r-tidyverse=1.2.1
- r-biocmanager=1.30.4
- r-bold=0.9.0
- bioconductor-sangerseqr=1.22.0
- r-ggplot2=3.1.1
- muscle=3.8.1551
- r-raster=3.*
- r-countrycode=1.1.0
- r-mass
- r-lme4
- r-shape
- r-visreg
- r-betareg=3.*
- r-spatial=7.*
- r-hier.part
- r-gridextra=2.3
......@@ -6,5 +6,7 @@ no seq & no spec & notlatlon 02-bold_format/bold_nolatlon_nospec_noseq.tsv 10089
add geonames assignment 02-bold_format/all_format_bold.tsv 102599
select COI sequences 02-bold_format/all_format_bold_coi.tsv 97146
select 499 baseslength sequences 02-bold_format/all_format_bold_coi_length499.tsv 95609
select 420 baseslength sequences 02-bold_format/all_format_bold_coi_length420.tsv 96127
keep species with at least 2 sequences 91168
library("bold")

# Taxon requested from BOLD; its name is also part of the output file name.
taxonRequest <- "Actinopterygii"
resboldObjFile <- paste0("rdata/bold_res_", taxonRequest, ".RData")

# Create the output directory before sending the request, so that the
# downloaded data are not lost because save() cannot open its target file.
dir.create(dirname(resboldObjFile), showWarnings = FALSE, recursive = TRUE)

# Request the BOLD sequence + specimen data for the taxon.
resBold <- bold_seqspec(taxon = taxonRequest, sepfasta = TRUE)

## Saving object in RData format
# The object is stored under the name `resBold`, which is the name it is
# restored with by load().
save(resBold, file = resboldObjFile)
## load request bold data sequence+specimens
# Rebuild the path of the RData file from the taxon name, then restore the
# objects it contains into the current environment.
taxonRequest <- "Actinopterygii"
resboldObjFile <- paste0("rdata/bold_res_", taxonRequest, ".RData")
load(file = resboldObjFile)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment