Difference analysis + volcano map + Cox model construction

Time:2022-1-15

Developeppaper: how to analyze the difference of transcriptome data without biological duplication?

The data required by edger is the reads number. You can set the BCV value for single sample difference analysis.
The edge package can perform non repetitive difference analysis, but you need to specify a dispersion value (set the BCV value), so that the results are more subjective, and different people can have different results. Generally, if it is good human data controlled by experiments, BCV = 0.4 is selected, and BCV = 0.1 is selected for better model organisms.

6 luad verification

6.2 differential gene analysis and functional analysis

Differential gene analysis

library(limma)
rt=read.table("OV.txt",header=T,sep="\t",check.names=F)
#If a gene occupies more than one line, take the mean
rt=as.matrix(rt)
rownames(rt)=rt[,1]
exp=rt[,2:ncol(rt)]
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data=avereps(data)
data<-as.data.frame(data)
nrow(data)
data=data[rowMeans(data)>0.5,]

group_list=c(rep("high",177),rep("low",176))  
table(group_list)



library(edgeR)

dge <- DGEList(counts=data,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~0+group_list)
rownames(design)<-colnames(dge)
colnames(design)<-levels(group_list)

dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1)) 

DEG=topTags(fit2, n=nrow(data))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 2
DEG$change = as.factor(
  ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
         ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
head(DEG)
table(DEG$change)
edgeR_DEG <- DEG

#Extraction of differential gene expression matrix
DEG<-DEG[DEG$change!="NOT",]
library(data.table)
#Fintersect intersection
a<-data.table(rownames(DEG))
diff<-data[match(a$V1,row.names(data)),]
write.csv(diff,file = "diff.csv",quote=F,row.names = T)

source("3-plotfunction.R")
logFC_cutoff <- 2
dat = log(data+1)
pca.plot = draw_pca(dat,group_list)

cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
h2 = draw_heatmap(data[cg2,],group_list)
v2 = draw_volcano(test = edgeR_DEG[,c(1,4,6)],pkg = 2)
v2
library(patchwork)
(h2) / (v2) +plot_layout(guides = 'collect')
ggsave("heat_volcano.png",width = 7,height = 5)
Difference analysis + volcano map + Cox model construction

Volcanic map

functional analysis

library("org.Hs.eg.db")
rt=read.csv("diff.csv",sep=",",check.names=F,header=T)
rt<-rt[,1]
rt<-as.data.frame(rt)

genes=as.vector(rt[,1])
entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
entrezIDs <- as.character(entrezIDs)
out=cbind(rt,entrezID=entrezIDs)
write.table(out,file="id.txt",sep="\t",quote=F,row.names=F)


###############Go analysis
library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")

rt=read.table("id.txt",sep="\t",header=T,check.names=F)
rt=rt[is.na(rt[,"entrezID"])==F,]

cor=rt$cor
gene=rt$entrezID
names(cor)=gene

#Go enrichment analysis
kk <- enrichGO(gene = gene,
               OrgDb = org.Hs.eg.db, 
               pvalueCutoff =0.05, 
               qvalueCutoff = 0.05,
               ont="all",
               readable =T)
write.table(kk,file="GO.txt",sep="\t",quote=F,row.names = F)

#Histogram
tiff(file="GO.tiff",width = 26,height = 20,units ="cm",compression="lzw",bg="white",res=600)
barplot(kk, drop = TRUE, showCategory =10,split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale='free')
dev.off()


#####################KEGG analysis
library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")

setwd("C:\\Users\\lexb4\\Desktop\\CCLE\.KEGG")
rt=read.table("id.txt",sep="\t",header=T,check.names=F)
rt=rt[is.na(rt[,"entrezID"])==F,]

cor=rt$cor
gene=rt$entrezID
names(cor)=gene

#KEGG enrichment analysis
kk <- enrichKEGG(gene = gene, organism = "hsa", pvalueCutoff =0.05, qvalueCutoff =0.05)
write.table(kk,file="KEGG.txt",sep="\t",quote=F,row.names = F)

#Histogram
tiff(file="barplot.tiff",width = 20,height = 12,units ="cm",compression="lzw",bg="white",res=600)
barplot(kk, drop = TRUE, showCategory = 20)
dev.off()

6.3 batch km curve

Input data km Txt, the data is fpkm

Difference analysis + volcano map + Cox model construction

KM.txt
library(survival)
rt=read.table("KM.txt",header=T,sep="\t",check.names=F)
outTab=data.frame()

for(gene in colnames(rt[,4:ncol(rt)])){
  a=rt[,gene]<=median(rt[,gene])
  diff=survdiff(Surv(futime, fustat) ~a,data = rt)
  pValue=1-pchisq(diff$chisq,df=1)
  outTab=rbind(outTab,cbind(gene=gene,pvalue=pValue))
  
  fit <- survfit(Surv(futime, fustat) ~ a, data = rt)
  summary(fit)
  #Only genes with P < 0.05 were drawn.
  if(pValue<0.05){
    if(pValue<0.001){
      pValue="<0.001"
    }else{
      pValue=round(pValue,3)
      pValue=paste0("=",pValue)
    }
    pdf(file=paste(gene,".survival.pdf",sep=""),
        width=6,
        height=6)
    plot(fit, 
         lwd=2,
         col=c("red","blue"),
         xlab="Time (year)",
         mark. Time = t, # display deleted data
         ylab="Survival rate",
         main=paste(gene,"(p", pValue ,")",sep=""))
    legend("topright", 
           c("High","Low"), 
           lwd=2, 
           col=c("red","blue"))
    dev.off()
  }
}
write.table(outTab,file="survival.xls",sep="\t",row.names=F,quote=F)

6.4 Cox model construction of differential genes

The Cox model was constructed with fpkm data, and the OS related genes were analyzed.
Input data exptime txt

Difference analysis + volcano map + Cox model construction

exptime.txt

Single factor Cox

Pfilter = 0.01 # defines single factor significance
Library (survival) # reference package
rt=read. Table ("exptime. TXT", header = t, Sep = "\ T", check. Names = f, row. Names = 1) # reads the input file

sigGenes=c("futime","fustat")
outTab=data.frame()
for(i in colnames(rt[,3:ncol(rt)])){
 cox <- coxph(Surv(futime, fustat) ~ rt[,i], data = rt)
 coxSummary = summary(cox)
 coxP=coxSummary$coefficients[,"Pr(>|z|)"]
 outTab=rbind(outTab,
              cbind(id=i,
              HR=coxSummary$conf.int[,"exp(coef)"],
              HR.95L=coxSummary$conf.int[,"lower .95"],
              HR.95H=coxSummary$conf.int[,"upper .95"],
              pvalue=coxSummary$coefficients[,"Pr(>|z|)"])
              )
  if(coxP<pFilter){
      sigGenes=c(sigGenes,i)
  }
}
write.table(outTab,file="uniCox.xls",sep="\t",row.names=F,quote=F)
uniSigExp=rt[,sigGenes]
uniSigExp=cbind(id=row.names(uniSigExp),uniSigExp)
write.table(uniSigExp,file="uniSigExp.txt",sep="\t",row.names=F,quote=F)

Lasso regression

library("glmnet")
library("survival")
rt=read. Table ("unisigexp. TXT", header = t, Sep = "\ T", row. Names = 1, check. Names = f) # reads the file
rt$futime[rt$futime<=0]=1

x=as.matrix(rt[,c(3:ncol(rt))])
y=data.matrix(Surv(rt$futime,rt$fustat))

fit <- glmnet(x, y, family = "cox", maxit = 1000)
pdf("lambda.pdf")
plot(fit, xvar = "lambda", label = TRUE)
dev.off()

cvfit <- cv.glmnet(x, y, family="cox", maxit = 1000)
pdf("cvfit.pdf")
plot(cvfit)
abline(v=log(c(cvfit$lambda.min,cvfit$lambda.1se)),lty="dashed")
dev.off()

coef <- coef(fit, s = cvfit$lambda.min)
index <- which(coef != 0)
actCoef <- coef[index]
lassoGene=row.names(coef)[index]
lassoGene=c("futime","fustat",lassoGene)
lassoSigExp=rt[,lassoGene]
lassoSigExp=cbind(id=row.names(lassoSigExp),lassoSigExp)
write.table(lassoSigExp,file="lassoSigExp.txt",sep="\t",row.names=F,quote=F)

Construction of multifactor Cox model

library(survival)
library(survminer)

rt=read. Table ("lassosigexp. TXT", header = t, Sep = "\ T", check. Names = f, row. Names = 1) # reads the train input file
rt[,"futime"]=rt[,"futime"]/365

#Build Cox model using train group
multiCox=coxph(Surv(futime, fustat) ~ ., data = rt)
multiCox=step(multiCox,direction = "both")
multiCoxSum=summary(multiCox)

#Output model related information
outTab=data.frame()
outTab=cbind(
             coef=multiCoxSum$coefficients[,"coef"],
             HR=multiCoxSum$conf.int[,"exp(coef)"],
             HR.95L=multiCoxSum$conf.int[,"lower .95"],
             HR.95H=multiCoxSum$conf.int[,"upper .95"],
             pvalue=multiCoxSum$coefficients[,"Pr(>|z|)"])
outTab=cbind(id=row.names(outTab),outTab)
write.table(outTab,file="multiCox.xls",sep="\t",row.names=F,quote=F)

#Mapping the forest
pdf(file="forest.pdf",
       Width = 8, # picture width
       Height = 5, # picture height
       )
ggforest(multiCox,
         main = "Hazard ratio",
         cpositions = c(0.02,0.22, 0.4), 
         fontsize = 0.7, 
         refLabel = "reference", 
         noDigits = 2)
dev.off()

#Output train group risk file
Riskscore = predict (multicox, type = "risk", newdata = RT) # use the model obtained by train to predict the risk of train samples
coxGene=rownames(multiCoxSum$coefficients)
coxGene=gsub("`","",coxGene)
outCol=c("futime","fustat",coxGene)
medianTrainRisk=median(riskScore)
risk=as.vector(ifelse(riskScore>medianTrainRisk,"high","low"))
write.table(cbind(id=rownames(cbind(rt[,outCol],riskScore,risk)),cbind(rt[,outCol],riskScore,risk)),
    file="riskTrain.txt",
    sep="\t",
    quote=F,
    row.names=F)
####################The following is the risk file output for the validation set#################
#Output test group risk file
rtTest=read. Table ("test. TXT", header = t, Sep = "\ T", check. Names = f, row. Names = 1) # reads the test input file
rtTest[,"futime"]=rtTest[,"futime"]/365
Riskscoretest = predict (multicox, type = "risk", newdata = rttest) # use the model obtained by train to predict the risk of test samples
riskTest=as.vector(ifelse(riskScoreTest>medianTrainRisk,"high","low"))
write.table(cbind(id=rownames(cbind(rtTest[,outCol],riskScoreTest,riskTest)),cbind(rtTest[,outCol],riskScore=riskScoreTest,risk=riskTest)),
    file="riskTest.txt",
    sep="\t",
    quote=F,
    row.names=F)

6.5 risk graph drawing

Risk survival curve

library(survival)

#Draw the survival curve of train group
rt=read.table("riskTrain.txt",header=T,sep="\t")
diff=survdiff(Surv(futime, fustat) ~risk,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
pValue=signif(pValue,4)
pValue=format(pValue, scientific = TRUE)
fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
Summary (FIT) # view 5-year survival
pdf(file="survivalTrain.pdf",width=5.5,height=5)
plot(fit, 
     lwd=2,
     col=c("red","blue"),
     xlab="Time (year)",
     ylab="Survival rate",
     main=paste("Survival curve (p=", pValue ,")",sep=""),
     mark.time=T)
legend("topright", 
       c("high risk", "low risk"),
       lwd=2,
       col=c("red","blue"))
dev.off()

#Draw the survival curve of test group
rt=read.table("riskTest.txt",header=T,sep="\t")
diff=survdiff(Surv(futime, fustat) ~risk,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
pValue=signif(pValue,4)
pValue=format(pValue, scientific = TRUE)
fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
Summary (FIT) # view 5-year survival
pdf(file="survivalTest.pdf",width=5.5,height=5)
plot(fit, 
     lwd=2,
     col=c("red","blue"),
     xlab="Time (year)",
     ylab="Survival rate",
     main=paste("Survival curve (p=", pValue ,")",sep=""),
     mark.time=T)
legend("topright", 
       c("high risk", "low risk"),
       lwd=2,
       col=c("red","blue"))
dev.off()
Difference analysis + volcano map + Cox model construction

Km curve

Risk ROC curve

library(survivalROC)

rt=read.table("riskTrain.txt",header=T,sep="\t",check.names=F,row.names=1)
pdf(file="rocTrain.pdf",width=6,height=6)
par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, 
      predict.time =1, method="KM")
plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col='red', 
  xlab="False positive rate", ylab="True positive rate",
  main=paste("ROC curve (", "AUC = ",round(roc$AUC,3),")"),
  lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
abline(0,1)
dev.off()

rt=read.table("riskTest.txt",header=T,sep="\t",check.names=F,row.names=1)
pdf(file="rocTest.pdf",width=6,height=6)
par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, 
      predict.time =1, method="KM")
plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col='red', 
  xlab="False positive rate", ylab="True positive rate",
  main=paste("ROC curve (", "AUC = ",round(roc$AUC,3),")"),
  lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
abline(0,1)
dev.off()
Difference analysis + volcano map + Cox model construction

ROC curve