Commit 177af864 authored by eboulanger's avatar eboulanger
Browse files

red sunfish to bravo chicken

parent 060a0f16
......@@ -25,4 +25,22 @@ Keep only nonsynonymous variants
```
bash keep_nonsynonymous.sh
```
\ No newline at end of file
```
_____________________________________________________________
## Revision
Organise folders as follows:
```
.
├── annotation -> genome annotations
├── diplodus -> output files for D. sargus
├── filtered -> filtered SNP input files
├── genomes -> genomes
├── mullus -> output files for M. surmuletus
└── raw -> raw SNP input files
```
......@@ -13,43 +13,64 @@ tbl_dip_pcaupns <- read.table("../../seaConnect--radFishComp/200_genome_BLAST/04
tbl_mul_cds <- read.table("mullus/mullus_CDS_pos.txt")
tbl_mul_cdsns <- read.table("mullus/mullus_CDS_nonsynonymous_pos.txt")
tbl_mul_pca <- read.table("mullus/mullus_pcadapt_pos.txt")
tbl_mul_pcaupns <- read.table("../../seaConnect--radFishComp/200_genome_BLAST/04-filter_uniprot/mullus_CDS_uniprot_nonsynonymous.tsv")
# create new ID
dip_cds <- paste(tbl_dip_cds$V1, tbl_dip_cds$V2, sep = "_")
dip_pca <- paste(tbl_dip_pca$V1, tbl_dip_pca$V2, sep = "_")
dip_cdsns <- paste(tbl_dip_cdsns$V1, tbl_dip_cdsns$V2, sep = "_")
dip_pcaupns<- paste(tbl_dip_pcaupns$V1, tbl_dip_pcaupns$V2, sep = "_")
dip_cds <- paste(tbl_dip_cds$V1, tbl_dip_cds$V2, sep = "_")
dip_pca <- paste(tbl_dip_pca$V1, tbl_dip_pca$V2, sep = "_")
dip_cdsns <- paste(tbl_dip_cdsns$V1, tbl_dip_cdsns$V2, sep = "_")
dip_pcaupns <- unique(paste(tbl_dip_pcaupns$V1, tbl_dip_pcaupns$V2, sep = "_"))
mul_cds <- paste(tbl_mul_cds$V1, tbl_mul_cds$V2, sep = "_")
mul_pca <- paste(tbl_mul_pca$V1, tbl_mul_pca$V2, sep = "_")
mul_cdsns <- paste(tbl_mul_cdsns$V1, tbl_mul_cdsns$V2, sep = "_")
mul_pcaupns <- unique(paste(tbl_mul_pcaupns$V1, tbl_mul_pcaupns$V2, sep = "_"))
mul_cds <- paste(tbl_mul_cds$V1, tbl_mul_cds$V2, sep = "_")
mul_pca <- paste(tbl_mul_pca$V1, tbl_mul_pca$V2, sep = "_")
mul_cdsns <- paste(tbl_mul_cdsns$V1, tbl_mul_cdsns$V2, sep = "_")
# compare obtained SNPs with both methods
dip_cdsXpca <- intersect(dip_cds,dip_pca)
dip_cdsnsXpca <- intersect(dip_cds_ns,dip_pca)
dip_cdsXpcaXcdsns <- intersect(intersect(dip_cds, dip_pca), dip_cds_ns)
dip_cdsnsXpca <- intersect(dip_cdsns,dip_pca)
dip_cdsXpcaXcdsns <- intersect(intersect(dip_cds, dip_pca), dip_cdsns)
intersect(dip_cds, dip_pcaupns)
dip_A <- setdiff(setdiff(dip_cds, dip_cds_ns), dip_pca)
dip_B <- setdiff(setdiff(dip_cds_ns, dip_pca), dip_cds)
dip_A <- setdiff(setdiff(dip_cds, dip_cdsns), dip_pca)
dip_B <- setdiff(setdiff(dip_cdsns, dip_pca), dip_cds)
dip_C <- setdiff(dip_pca, dip_cds)
dip_AB <- setdiff(intersect(dip_cds, dip_cds_ns), dip_pca)
dip_AC <- setdiff(intersect(dip_cds, dip_pca), dip_cds_ns)
dip_BC <- setdiff(intersect(dip_cds_ns, dip_pca), dip_cds)
dip_ABC <- intersect(intersect(dip_cds, dip_pca), dip_cds_ns)
dip_AB <- setdiff(intersect(dip_cds, dip_cdsns), dip_pca)
dip_AC <- setdiff(intersect(dip_cds, dip_pca), dip_cdsns)
dip_BC <- setdiff(intersect(dip_cdsns, dip_pca), dip_cds)
dip_ABC <- intersect(intersect(dip_cds, dip_pca), dip_cdsns)
dip_plusplus <- intersect(dip_cds_ns, dip_pca_cds_up_ns)
dip_plusplus <- intersect(dip_cdsns, dip_pcaupns)
mul_cdsXpca <- intersect(mul_cds, mul_pca)
mul_cdsnsXpca <- intersect(mul_cds_ns, mul_pca)
mul_cdsXpcaXcdsns <- intersect(intersect(mul_cds, mul_pca), mul_cds_ns)
mul_cdsnsXpca <- intersect(mul_cdsns, mul_pca)
mul_cdsXpcaXcdsns <- intersect(intersect(mul_cds, mul_pca), mul_cdsns)
intersect(mul_cdsns, mul_pcaupns)
mul_A <- setdiff(setdiff(mul_cds, mul_cds_ns), mul_pca)
mul_B <- setdiff(setdiff(mul_cds_ns, mul_pca), mul_cds)
mul_A <- setdiff(setdiff(mul_cds, mul_cdsns), mul_pca)
mul_B <- setdiff(setdiff(mul_cdsns, mul_pca), mul_cds)
mul_C <- setdiff(mul_pca, mul_cds)
mul_AB <- setdiff(intersect(mul_cds, mul_cds_ns), mul_pca)
mul_AC <- setdiff(intersect(mul_cds, mul_pca), mul_cds_ns)
mul_BC <- setdiff(intersect(mul_cds_ns, mul_pca), mul_cds)
mul_ABC <- intersect(intersect(mul_cds, mul_pca), mul_cds_ns)
mul_AB <- setdiff(intersect(mul_cds, mul_cdsns), mul_pca)
mul_AC <- setdiff(intersect(mul_cds, mul_pca), mul_cdsns)
mul_BC <- setdiff(intersect(mul_cdsns, mul_pca), mul_cds)
mul_ABC <- intersect(intersect(mul_cds, mul_pca), mul_cdsns)
# tried all together but wonky
# mul_D <- setdiff(setdiff(mul_cds, mul_cdsns), mul_pca)
# mul_E <- setdiff(setdiff(mul_cdsns, mul_pca), mul_cds)
# mul_F <- setdiff(mul_pca, mul_cds)
# mul_G <- setdiff(setdiff(mul_pcaupns, mul_cds), mul_pca)
# mul_DE <- setdiff(setdiff(intersect(mul_cds, mul_cdsns), mul_pca), mul_pcaupns)
# mul_DF <- setdiff(setdiff(intersect(mul_cds, mul_pca), mul_cdsns), mul_pcaupns)
# mul_DG <- setdiff(setdiff(intersect(mul_cds, mul_pcaupns), mul_cdsns), mul_pca)
# mul_EF <- setdiff(setdiff(intersect(mul_cdsns, mul_pca), mul_cds), mul_pcaupns)
# mul_EG <- setdiff(setdiff(intersect(mul_cdsns, mul_pcaupns), mul_cds), mul_pca)
# mul_FG <- setdiff(setdiff(intersect(mul_pca, mul_pcaupns), mul_cds), mul_cdsns)
# mul_DEFG <- intersect(mul_cdsns, mul_pcaupns)
## euler diagrams
library(eulerr)
......@@ -58,19 +79,19 @@ dip_fit <- euler(c("A" = length(dip_cds)-length(dip_cdsXpca),
"A&B" = length(dip_cdsXpca)))
plot(dip_fit, quantities = TRUE, labels = TRUE)
dip_fit2 <- euler(c("CDS" = length(dip_A),
dip_fit2 <- euler(c("CDS" = length(dip_A),
"CDS_NS" = length(dip_B),
"PCA" = length(dip_C),
"PCA" = length(dip_C),
"CDS&CDS_NS" = length(dip_AB),
"CDS&PCA" = length(dip_AC),
"CDS&PCA" = length(dip_AC),
"CDS_NS&PCA" = length(dip_BC),
"CDS&CDS_NS&PCA" = length(dip_ABC)))
plot(dip_fit2, quantities = TRUE, labels = TRUE)
mul_fit <- euler(c("A" = length(mul_cds_id)-length(mul_comm),
"B" = length(mul_pca_id)-length(mul_comm),
"A&B" = length(mul_comm)))
mul_fit <- euler(c("A" = length(mul_cds)-length(mul_cdsXpca),
"B" = length(mul_pca)-length(mul_cdsXpca),
"A&B" = length(mul_cdsXpca)))
plot(mul_fit, quantities = TRUE, labels = TRUE)
mul_fit2 <- euler(c("CDS" = length(mul_A),
......@@ -81,3 +102,16 @@ mul_fit2 <- euler(c("CDS" = length(mul_A),
"CDS_NS&PCA" = length(mul_BC),
"CDS&CDS_NS&PCA" = length(mul_ABC)))
plot(mul_fit2, quantities = TRUE, labels = TRUE)
# mul_fit3 <- euler(c("CDS" = length(mul_D),
# "CDS_NS" = length(mul_E),
# "PCA_UPNS" = length(mul_G),
# "PCA" = length(mul_F),
# "CDS&CDS_NS" = length(mul_DE),
# "CDS&PCA" = length(mul_DF),
# "CDS&PCA_UPNS" = length(mul_DG),
# "CDS_NS&PCA" = length(mul_EF),
# "CDS_NS&PCA_UPNS" = length(mul_EG),
# "PCA&PCA_UPNS" = length(mul_FG),
# "CDS&CDS_NS&PCA&PCA_UPNS" = length(mul_DEFG)))
# plot(mul_fit3, quantities = TRUE, labels = TRUE)
......@@ -14,12 +14,12 @@ awk '!/#/' diplodus/diplodus_CDS_nonsynonymous.vcf | wc -l
#### get flanking region of non synonymous SNPs
i="diplodus"
i="mullus"
anvage flank -f ${i}_CDS_nonsynonymous.vcf -g genomes/${i}_genome_lgt6000.fasta -w 200 -o ${i}_CDS_nonsynonymous;
anvage flank -f ${i}/${i}_CDS_nonsynonymous.vcf -g genomes/${i}_genome_lgt6000.fasta -w 200 -o ${i}/${i}_CDS_nonsynonymous;
#### blast flanking sequences against uniprot
###### diplodus
blastx -db swissprot_nucl -query "diplodus_CDS_nonsynonymous_flanking.fasta" -task blastx -outfmt "7 delim=, qacc sacc evalue bitscore qcovus pident" -max_target_seqs 5 > diplodus_CDS_nonsynonymous_uniprot.blastout
blastx -db swissprot_nucl -query "diplodus/diplodus_CDS_nonsynonymous_flanking.fasta" -task blastx -outfmt "7 delim=, qacc sacc evalue bitscore qcovus pident" -max_target_seqs 5 > diplodus_CDS_nonsynonymous_uniprot.blastout
bash blastout2tsv.sh diplodus_CDS_nonsynonymous_uniprot.blastout diplodus_CDS_nonsynonymous
###### mullus
blastx -db swissprot_nucl -query "mullus_CDS_nonsynonymous_flanking.fasta" -task blastx -outfmt "7 delim=, qacc sacc evalue bitscore qcovus pident" -max_target_seqs 5 > mullus_CDS_nonsynonymous_uniprot.blastout
blastx -db swissprot_nucl -query "mullus/mullus_CDS_nonsynonymous_flanking.fasta" -task blastx -outfmt "7 delim=, qacc sacc evalue bitscore qcovus pident" -max_target_seqs 5 > mullus_CDS_nonsynonymous_uniprot.blastout
bash blastout2tsv.sh mullus_CDS_nonsynonymous_uniprot.blastout mullus_CDS_nonsynonymous
## make blast database based on swissprot records
##wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/fastafiles/uniprot/uniprotkb_swissprot.gz
zcat uniprotkb_swissprot.gz | awk '{if (/^>/) { print ">" $2} else { print $_}}' > swissprot.fa
gunzip uniprotkb_swissprot.gz
# on mac: does not work, error to debug
gunzip -c uniprotkb_swissprot.gz | awk '{if (/^>/) { print ">" $2} else { print $_}}' > swissprot.fa
# gunzip uniprotkb_swissprot.gz
makeblastdb -in swissprot.fa -out swissprot_nucl -dbtype prot
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment