Data required for variance analysis:**Expression matrix**and**Grouping information**

The data of TCGA only need the expression matrix, because the sample ID of TCGA is special. Whether the 14th and 15th bits of the sample ID are > =10 or <10 represents whether the sample is a normal sample or a tumor sample.

The starting point of the three difference analysis r packets is the count matrix (reads count matrix)The count matrix cannot be directly used for variance analysis, so the three R packets have their own processing methods for the count matrix.If you can’t get the count matrix

- Rsem: all three R packageshttps://www.jianshu.com/p/46b048220b88
- TPM: use the limma package for variance analysis (as a last resort)
- Fpkm, rpkm: convert to TPM, and use limma for difference analysis (as a last resort)https://mp.weixin.qq.com/s/_DtkxSfLGQHcRju66J4yTQ

### 1. use three r-packs for enrichment analysis

preparation

Expression matrix and grouping messages from:Three methods of TCGA data downloading and sorting

```
if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(DESeq2))BiocManager::install('DESeq2')
if(!require(edgeR))BiocManager::install('edgeR')
if(!require(limma))BiocManager::install('limma')
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
table(Group)
```

##### 1.1 DESeq2

The deseq2 analysis requires two steps:

1. the deseqdatasetsetfrommatrix() function generates the data type required by deseq2 from the matrix;

2. deseq() for variance analysis

```
library(DESeq2)
colData <- data.frame(row.names =colnames(exp),
Condition=group) \\generated row name is sample ID, and column is data frame of grouping information
dds <- DESeqDataSetFromMatrix(
Countdata = exp, \\
Coldata = coldata, \e expressing the correspondence between the column names and groups of the matrix
Design = ~ condition) \\
DDS < - deseq (DDS) \
save(dds,file = paste0(proj,"_dd.Rdata"))
```

Extracting difference expression matrix from difference analysis results

```
load(file = paste0(proj,"_dd.Rdata"))
#Results function: extracts the differential expression results from DDS. The generated objects are still deseq2 objects, which can be converted into data frames
Res < - results (DDS, contrast = C ("condition", rev (levels (Group)))) \contrast parameter must be written in the format of the vector of the following three elements, and the order cannot be reversed (refer to the help document)
Reordered < - res[order (res$pvalue),] \
DEG < - as data. Frame (reordered) \
DEG = na Omit (DEG) \e without this step, some genes with low expression will appear Na after calculation, which will bring trouble to subsequent analysis and mapping
View(DEG)
```

Addition of change column marker gene up regulation and down regulation

```
#Set the threshold to mean+2sd
logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
Logfc_ Cutoff \\_ Cutoff is different
# [1] 3.909132
k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
# DOWN NOT UP
# 783 28724 841
head(DEG)
Deseq2_ DEG < - DEG \\
```

##### 1.2 edgeR

Perform differential expression analysis and extract differential expression matrix

```
library(edgeR)
DGE < - dgelist (counts=exp, group=group) \
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
Design < - model Matrix (~0+group) \
rownames(design)<-colnames(dge)
colnames(design)<-levels(Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
Fit2 < - glmlrt (fit, contrast=c (-1,1)) .
DEG=topTags(fit2, n=nrow(exp))
Deg=as data. Frame (DEG) \\
View(DEG)
```

Addition of change column marker gene up regulation and down regulation

```
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff
# [1] 4.290788
k1 = (DEG$PValue < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$PValue < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
head(DEG)
table(DEG$change)
# DOWN NOT UP
# 533 28643 1172
edgeR_DEG <- DEG
```

##### 1.3 limma

The main difference between the difference analysis of TCGA gene expression count matrix with limma and the difference analysis of chip data with limma lies in voom standardization

```
library(limma)
Design < - model Matrix (~0+group) \
colnames(design)=levels(Group)
rownames(design)=colnames(exp)
DGE < - dgelist (counts=exp) \
dge <- calcNormFactors(dge)
v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)
Constraints = paste (Rev (levels (Group)), collapse = "-") \
constrasts
# [1] "tumor-normal"
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
DEG = topTable(fit2, coef=constrasts, n=Inf)
DEG = na.omit(DEG)
View(DEG)
```

```
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
head(DEG)
# DOWN NOT UP
# 1060 28915 373
limma_voom_DEG <- DEG
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
edgeR = as.integer(table(edgeR_DEG$change)),
limma_voom = as.integer(table(limma_voom_DEG$change)),
row.names = c("down","not","up")
);tj
save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,Group,tj,file = paste0(proj,"_DEG.Rdata"))
```

##### 1.4 check the number of up and down regulated genes obtained by the three R packets

```
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
edgeR = as.integer(table(edgeR_DEG$change)),
limma_voom = as.integer(table(limma_voom_DEG$change)),
row.names = c("down","not","up")
tj
# deseq2 edgeR limma_voom
# down 783 533 1060
# not 28724 28643 28915
# up 841 1172 373
```

The results are quite different, but there is no difference between right and wrong, but the algorithm is different.

### 2. visualization of difference analysis results

```
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
If (! Require (tinyarray)) devtools:: install_ Local ("tinyarray master.zip", upgrade = f) \\
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
#####2.1 PCA diagram
```

The matrix here is still count, and the count matrix is only used for variance analysis. See the following for the reasons:Differences between read count, CPM, rpkm, fpkm and TPM

```
#CPM removes the effect of library size
Dat = log2 (CPM (EXP) +1) \
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
```

##### 2.2 heat map

Select the differential genes made by three r-packets and draw the heat map

```
cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT"]
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT"]
H1 = draw_ Heatmap (dat[cg1,], group, n\u cutoff = 2) \n_ Cutoff=2, color assignment is -2 to 2, and beyond this range, the color is the same as -2 or 2 to eliminate the influence of extreme value
h2 = draw_heatmap(dat[cg2,],Group,n_cutoff = 2)
h3 = draw_heatmap(dat[cg3,],Group,n_cutoff = 2)
```

##### 2.3 volcano map

Calculate the mean+2sd threshold (the first three R packets are all called logfc\u cutoff, and they are not saved. Recalculate here)

```
m2d = function(x){
mean(abs(x))+2*sd(abs(x))
}
v1 = draw_volcano(DESeq2_DEG,pkg = 1,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange))
v2 = draw_volcano(edgeR_DEG,pkg = 2,logFC_cutoff = m2d(edgeR_DEG$logFC))
v3 = draw_volcano(limma_voom_DEG,pkg = 3,logFC_cutoff = m2d(limma_voom_DEG$logFC))
#PKG parameters: a integer, means which differential analysis packages you used, we support three packages by now, 1,2,3,4 carefully means "deseq2", "edge", "limma (voom)", "limma"
```

##### Jigsaw puzzle

```
library(patchwork)
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 15,height = 10)
```

### 3. comparison of three r-packet differential genes

```
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
load("TCGA-CHOL_pcaplot.Rdata")
UP=function(df){
rownames(df)[df$change=="UP"]
}\
DOWN=function(df){
rownames(df)[df$change=="DOWN"]
}\
Up = intersect (intersect (up (deseq2\u DEG), up (edge\u DEG)), up (limma\u voom\u DEG)) \
Down = intersect (intersect (down (deseq2\u DEG), down (edge\u DEG)), down (limma\u voom\u DEG)) \
```

Mapping common and differential genes

```
dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group,n_cutoff = 2)
```

Draw Venn diagrams for up-regulated and down regulated genes respectively

```
#The list accepted by the drawing function here must be a list with a name, which is the name of each category to appear on the graph
up_genes = list(Deseq2 = UP(DESeq2_DEG),
edgeR = UP(edgeR_DEG),
limma = UP(limma_voom_DEG))
down_genes = list(Deseq2 = DOWN(DESeq2_DEG),
edgeR = DOWN(edgeR_DEG),
limma = DOWN(limma_voom_DEG))
up. Plot < - draw_ Venn (up\u genes, "upgene") \
down.plot <- draw_venn(down_genes,"DOWNgene")
```

Jigsaw puzzle

```
library(patchwork)
#up.plot + down.plot
#Jigsaw puzzle
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)
```

The code comes from 2021 Shengxin skill tree data mining course