deseq2.script.R 6.9 KB
Newer Older
1
2
3
4
5
6
#log <- file(snakemake@log[[1]], open="wt")
#sink(log)
#sink(log, type="message")
library(DESeq2)
library(apeglm)
library(tximport)
7
library(GenomicFeatures)
mmassaviol's avatar
mmassaviol committed
8
library(ggplot2)
9

khalid's avatar
khalid committed
10

11

12
13
samples = read.table(snakemake@config[["popmap_file"]],stringsAsFactors=F)$V1
conditions = read.table(snakemake@config[["popmap_file"]],stringsAsFactors=F)$V2
14
15
16
#samples = c("DRR016125","DRR016126","DRR016127","DRR016128","DRR016129","DRR016130","DRR016131","DRR016132")
#files = file.path("/home/mbb/Documents/results2/kallisto/quant",samples,"abundance.h5")
files = sort(snakemake@input[["counts"]])
17
names(files) = samples
18

19
20
21
22
23
24
25
26
27
28
29
if (snakemake@config[["quantification"]] == "htseq_count")
{
   print("Quantification was done with htseq-count !")
   sampleTable <- data.frame(
		sampleName = samples,
		fileName   = files,
		condition  = conditions
		)
    # create the DESeq data object
    dds <- DESeqDataSetFromHTSeqCount(
		sampleTable = sampleTable, 
khalid's avatar
khalid committed
30
		directory = "", 
31
32
33
		design = ~ condition
		)

khalid's avatar
khalid committed
34
} else{ # counts where generated with one of these software "salmon", "sailfish", "alevin", "kallisto", "rsem", "stringtie"
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
    if (snakemake@params[["tx2gene"]] == TRUE){
      # conversion transcrits vers gènes
      TxDb <- makeTxDbFromGFF(file = snakemake@params[["annotations"]])
      k <- keys(TxDb, keytype = "TXNAME")
      tx2gene <- select(TxDb, k, "GENEID", "TXNAME")
      txi <- tximport(files, type = snakemake@config[["quantification"]], tx2gene = tx2gene) 
    } else {
      txi <- tximport(files, type = snakemake@config[["quantification"]], txOut = TRUE) 
    }

    groups = as.factor(conditions)
    designMat <- model.matrix(~groups)
    colnames(designMat)[2] = "condition"

    sampleTable <- data.frame(condition = factor(conditions))
    rownames(sampleTable) <- colnames(txi$counts)

    dds <- DESeqDataSetFromTximport(txi, sampleTable, designMat)
53
54
55
56
57
}

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

mmassaviol's avatar
mmassaviol committed
58
dds <- DESeq(dds) #, fitType = snakemake@config[["deseq2_fitType"]], betaPrior = as.logical(snakemake@config[["deseq2_betaPrior"]]), test="Wald")
59
60
61
62
63
64

res <- results(dds)

save.image(snakemake@output[["r_data"]])
write.csv(res, snakemake@output[["de_table"]])

mmassaviol's avatar
mmassaviol committed
65
###### PREPARE REPORT ######
66
67
68
69
70
# normalized, variance-stabilized transformed counts for visualization
if (sum(keep) < 1000) {
  vsd = varianceStabilizingTransformation(dds)
  } else {
    vsd <- vst(dds, blind=FALSE)
71
}
72

mmassaviol's avatar
mmassaviol committed
73

74
75
dat <- plotPCA(vsd, intgroup="condition",returnData=TRUE)
percentVar <- round(100 * attr(dat, "percentVar"))
mmassaviol's avatar
mmassaviol committed
76

77
png(filename = snakemake@output[["PCA"]], height=800, width=800)
78
 p <- ggplot(dat,aes(x=PC1,y=PC2,col=group))
79
80
81
 p + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + ggtitle("PCA on normalized, variance-stabilized transformed counts")
mmassaviol's avatar
mmassaviol committed
82
83
dev.off()

84

mmassaviol's avatar
mmassaviol committed
85
86
options(digit=4)

87
88
# resSig <- subset(res, padj < 0.1)
# resSigdf = data.frame(resSig@listData,row.names = resSig@rownames)
mmassaviol's avatar
mmassaviol committed
89
90
91

#datatable(resSigdf,options = list(scrollX = '300px'))

92
93
94
#topGenes = head(resSigdf[order(resSigdf$padj),], 40)
#top 40 genes basé sur les p-values
topGenes = head(as.data.frame(res[order(res$pvalue),]) , 40)
mmassaviol's avatar
mmassaviol committed
95
96
97

cat("# id: Top_genes
# section_name: 'Top 40 genes'
98
# description: 'Tableau des 40 top genes avec les plus faibles pvalues'
mmassaviol's avatar
mmassaviol committed
99
# format: 'tsv'
mmassaviol's avatar
mmassaviol committed
100
# plot_type: 'table'\ngene\t", file=snakemake@output[["Top_genes"]])
mmassaviol's avatar
mmassaviol committed
101

mmassaviol's avatar
mmassaviol committed
102
write.table(topGenes,snakemake@output[["Top_genes"]],append=TRUE,sep="\t")
mmassaviol's avatar
mmassaviol committed
103

104
png(filename = snakemake@output[["MA_plot"]], height=800, width=800)
khalid's avatar
khalid committed
105
 DESeq2::plotMA(res, ylim=c(-5,5))
mmassaviol's avatar
mmassaviol committed
106
107
dev.off()

mmassaviol's avatar
mmassaviol committed
108
png(filename = snakemake@output[["Volcano_plot"]], height=800, width=800)
khalid's avatar
khalid committed
109
110
111
112
  with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
  with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
  with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="orange"))
  with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="green"))
mmassaviol's avatar
mmassaviol committed
113
dev.off()
mmassaviol's avatar
mmassaviol committed
114
115


116
117
118
119
120
121
# Heatmap
library(pheatmap)
select <- order(rowMeans(counts(dds,normalized=TRUE)), decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("condition")])
ntd <- normTransform(dds)
rownames(df) = samples
122
colnames(df) = "Conditions"
123
124
125
assayNTD = assay(ntd)

png(filename = snakemake@output[["Heatmap"]], height=800, width=800)
khalid's avatar
khalid committed
126
  pheatmap(assayNTD[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=TRUE, annotation_col=df)
127
dev.off()
128

khalid's avatar
khalid committed
129
speciesData = snakemake@params[["biomaRtDataset"]]
khalid's avatar
khalid committed
130

khalid's avatar
khalid committed
131
132
133
if (speciesData != "None") {
  # Add annotation with biomaRt
  library(biomaRt)
khalid's avatar
khalid committed
134
135
  tryCatch({
  mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="uswest.ensembl.org")
136

khalid's avatar
khalid committed
137
  species_mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", host = "uswest.ensembl.org", dataset = speciesData)
khalid's avatar
khalid committed
138
139
140
141
142
143

  #for the moment the top genes change to significant genes ?
  #what if we have tarnscript ID instead of gene ID ?
  ann <- getBM(filter="ensembl_gene_id",value=rownames(topGenes),attributes=c("ensembl_gene_id","description","transcript_length"),mart=species_mart)

  go_ann <- getBM(filter="ensembl_gene_id",value=rownames(topGenes),attributes=c("ensembl_gene_id","description","go_id","name_1006","namespace_1003"),mart=species_mart)
khalid's avatar
khalid committed
144
145
146
147
148
149
  write.table(go_ann,paste0(snakemake@params[["outDir"]],"/TopGenes_go_ann_mqc.tsv"),sep="\t",row.names =FALSE, quote = FALSE)
  },
  error = function(err) {
    print("Can't access Ensembl Biomart !")
  }
  )
khalid's avatar
khalid committed
150
151
152
153
  #test for gene enrichment with gprofiler2
  library(gprofiler2)
  #for the moment the top genes change to significant genes ?
  organisme = gsub("_gene_ensembl", "", speciesData)
khalid's avatar
khalid committed
154
155
156
157
158
  #limit to GO sources

  sources = c("GO:BP", "GO:MF", "GO:CC")
  query = rownames(topGenes) #setNames(as.list(rownames(topGenes)), rownames(topGenes))
  gostres <- gost(query = query, 
khalid's avatar
khalid committed
159
                  organism = organisme, ordered_query = FALSE, 
khalid's avatar
khalid committed
160
                  multi_query = FALSE, significant = FALSE, exclude_iea = TRUE, 
khalid's avatar
khalid committed
161
162
163
                  measure_underrepresentation = FALSE, evcodes = FALSE, 
                  user_threshold = 0.05, correction_method = "g_SCS", 
                  domain_scope = "annotated", custom_bg = NULL, 
khalid's avatar
khalid committed
164
                  numeric_ns = "", sources = sources, as_short_link = FALSE)
khalid's avatar
khalid committed
165

khalid's avatar
khalid committed
166
167
168
  png(filename = paste0(snakemake@params[["outDir"]],"/gprofiler2_plot_mqc.png"), height=800, width=800)
    p <- gostplot(gostres, capped = FALSE, interactive = FALSE)
    print(p  + ggtitle("gprofiler2 enrichment analysis for top genes to find over-representation of terms from Gene Ontology"))
khalid's avatar
khalid committed
169
  dev.off()
mmassaviol's avatar
mmassaviol committed
170

khalid's avatar
khalid committed
171
172
173
174
175
176
177
  
  #la colonne parents est une liste qu'il faut transformer avant write.table !!
  #write.table(apply(gostres$result,2,as.character), paste0(snakemake@params[["outDir"]],"/gprofiler2_mqc.tsv"), sep="\t",row.names =FALSE, quote = FALSE)
  #ou
  ttable = cbind(1:nrow(gostres$result), gostres$result[,-14])
  colnames(ttable)[1]="id"
  write.table(ttable, paste0(snakemake@params[["outDir"]],"/gprofiler2_mqc.tsv"), sep="\t",row.names =FALSE, quote = FALSE)
178
179

}
khalid's avatar
khalid committed
180
181