3. Big difference analysis R package: deseq2, edger and limma

Time:2022-5-27

Data required for variance analysis:Expression matrixandGrouping information
The data of TCGA only need the expression matrix, because the sample ID of TCGA is special. Whether the 14th and 15th bits of the sample ID are > =10 or <10 represents whether the sample is a normal sample or a tumor sample.

The starting point of the three difference analysis r packets is the count matrix (reads count matrix)The count matrix cannot be directly used for variance analysis, so the three R packets have their own processing methods for the count matrix.

If you can’t get the count matrix

1. use three r-packs for enrichment analysis

preparation
Expression matrix and grouping messages from:Three methods of TCGA data downloading and sorting

if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(DESeq2))BiocManager::install('DESeq2')
if(!require(edgeR))BiocManager::install('edgeR')
if(!require(limma))BiocManager::install('limma')

rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
table(Group)
1.1 DESeq2

The deseq2 analysis requires two steps:
1. the deseqdatasetsetfrommatrix() function generates the data type required by deseq2 from the matrix;
2. deseq() for variance analysis

library(DESeq2)
colData <- data.frame(row.names =colnames(exp), 
                      Condition=group) \\generated row name is sample ID, and column is data frame of grouping information
dds <- DESeqDataSetFromMatrix(
    Countdata = exp, \\ 
    Coldata = coldata, \e expressing the correspondence between the column names and groups of the matrix
    Design = ~ condition) \\
DDS < - deseq (DDS) \
save(dds,file = paste0(proj,"_dd.Rdata"))

Extracting difference expression matrix from difference analysis results

load(file = paste0(proj,"_dd.Rdata"))
#Results function: extracts the differential expression results from DDS. The generated objects are still deseq2 objects, which can be converted into data frames
Res < - results (DDS, contrast = C ("condition", rev (levels (Group)))) \contrast parameter must be written in the format of the vector of the following three elements, and the order cannot be reversed (refer to the help document)
Reordered < - res[order (res$pvalue),] \
DEG < - as data. Frame (reordered) \
DEG = na Omit (DEG) \e without this step, some genes with low expression will appear Na after calculation, which will bring trouble to subsequent analysis and mapping
View(DEG)
3. Big difference analysis R package: deseq2, edger and limma

Addition of change column marker gene up regulation and down regulation

#Set the threshold to mean+2sd
logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
Logfc_ Cutoff \\_ Cutoff is different
# [1] 3.909132
k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
# DOWN   NOT    UP 
#  783 28724   841 
head(DEG)

Deseq2_ DEG < - DEG \\
1.2 edgeR

Perform differential expression analysis and extract differential expression matrix

library(edgeR)

DGE < - dgelist (counts=exp, group=group) \
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

Design < - model Matrix (~0+group) \
rownames(design)<-colnames(dge)
colnames(design)<-levels(Group)

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
Fit2 < - glmlrt (fit, contrast=c (-1,1))              .

DEG=topTags(fit2, n=nrow(exp))
Deg=as data. Frame (DEG) \\
View(DEG)
3. Big difference analysis R package: deseq2, edger and limma

FDR is a kind of p.adj

Addition of change column marker gene up regulation and down regulation

logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff
# [1] 4.290788
k1 = (DEG$PValue < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$PValue < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))

head(DEG)
table(DEG$change)
# DOWN   NOT    UP 
#  533 28643  1172 
edgeR_DEG <- DEG
1.3 limma

The main difference between the difference analysis of TCGA gene expression count matrix with limma and the difference analysis of chip data with limma lies in voom standardization

library(limma)

Design < - model Matrix (~0+group) \
colnames(design)=levels(Group)
rownames(design)=colnames(exp)

DGE < - dgelist (counts=exp) \
dge <- calcNormFactors(dge)

v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)

Constraints = paste (Rev (levels (Group)), collapse = "-") \
constrasts
# [1] "tumor-normal"
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) 
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)

DEG = topTable(fit2, coef=constrasts, n=Inf)
DEG = na.omit(DEG)
View(DEG)
3. Big difference analysis R package: deseq2, edger and limma

logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
head(DEG)
# DOWN   NOT    UP 
# 1060 28915   373 
limma_voom_DEG <- DEG
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
           edgeR = as.integer(table(edgeR_DEG$change)),
           limma_voom = as.integer(table(limma_voom_DEG$change)),
           row.names = c("down","not","up")
          );tj
save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,Group,tj,file = paste0(proj,"_DEG.Rdata"))
1.4 check the number of up and down regulated genes obtained by the three R packets
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
                edgeR = as.integer(table(edgeR_DEG$change)),
                limma_voom = as.integer(table(limma_voom_DEG$change)),
                row.names = c("down","not","up")
tj
#      deseq2 edgeR limma_voom
# down    783   533       1060
# not   28724 28643      28915
# up      841  1172        373

The results are quite different, but there is no difference between right and wrong, but the algorithm is different.

2. visualization of difference analysis results

rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
If (! Require (tinyarray)) devtools:: install_ Local ("tinyarray master.zip", upgrade = f) \\
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
#####2.1 PCA diagram

The matrix here is still count, and the count matrix is only used for variance analysis. See the following for the reasons:Differences between read count, CPM, rpkm, fpkm and TPM

#CPM removes the effect of library size
Dat = log2 (CPM (EXP) +1)  \
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
3. Big difference analysis R package: deseq2, edger and limma

PCA diagram based on CPM data
2.2 heat map

Select the differential genes made by three r-packets and draw the heat map

cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT"]
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT"]

H1 = draw_ Heatmap (dat[cg1,], group, n\u cutoff = 2) \n_ Cutoff=2, color assignment is -2 to 2, and beyond this range, the color is the same as -2 or 2 to eliminate the influence of extreme value
h2 = draw_heatmap(dat[cg2,],Group,n_cutoff = 2)
h3 = draw_heatmap(dat[cg3,],Group,n_cutoff = 2)
2.3 volcano map

Calculate the mean+2sd threshold (the first three R packets are all called logfc\u cutoff, and they are not saved. Recalculate here)

m2d = function(x){
  mean(abs(x))+2*sd(abs(x))
}

v1 = draw_volcano(DESeq2_DEG,pkg = 1,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange))
v2 = draw_volcano(edgeR_DEG,pkg = 2,logFC_cutoff = m2d(edgeR_DEG$logFC))
v3 = draw_volcano(limma_voom_DEG,pkg = 3,logFC_cutoff = m2d(limma_voom_DEG$logFC))
#PKG parameters: a integer, means which differential analysis packages you used, we support three packages by now, 1,2,3,4 carefully means "deseq2", "edge", "limma (voom)", "limma"
Jigsaw puzzle
library(patchwork)
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 15,height = 10)
3. Big difference analysis R package: deseq2, edger and limma

3. comparison of three r-packet differential genes

rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
load("TCGA-CHOL_pcaplot.Rdata")
UP=function(df){
  rownames(df)[df$change=="UP"]
}\
DOWN=function(df){
  rownames(df)[df$change=="DOWN"]
}\

Up = intersect (intersect (up (deseq2\u DEG), up (edge\u DEG)), up (limma\u voom\u DEG)) \
Down = intersect (intersect (down (deseq2\u DEG), down (edge\u DEG)), down (limma\u voom\u DEG)) \

Mapping common and differential genes

dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group,n_cutoff = 2)

Draw Venn diagrams for up-regulated and down regulated genes respectively

#The list accepted by the drawing function here must be a list with a name, which is the name of each category to appear on the graph
up_genes = list(Deseq2 = UP(DESeq2_DEG),
          edgeR = UP(edgeR_DEG),
          limma = UP(limma_voom_DEG))
down_genes = list(Deseq2 = DOWN(DESeq2_DEG),
          edgeR = DOWN(edgeR_DEG),
          limma = DOWN(limma_voom_DEG))

up. Plot < - draw_ Venn (up\u genes, "upgene") \
down.plot <- draw_venn(down_genes,"DOWNgene")

Jigsaw puzzle

library(patchwork)
#up.plot + down.plot
#Jigsaw puzzle
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)
3. Big difference analysis R package: deseq2, edger and limma


The code comes from 2021 Shengxin skill tree data mining course