2021-11-09 volcano map based on RNA SEQ table (second time)

Time:2022-5-24
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")
library("readxl")
data <- read_excel("RNA-seq.xlsx")
library(dplyr)
library(ggplot2)
library(ggrepel)
2021-11-09 volcano map based on RNA SEQ table (second time)

data
#Convert to tibble for subsequent use and remove unnecessary columns;
Data < - Data [C (- 10, - 11, - 14, - 15, - 16, - 19, - 20, - 21, - 22)] # don't try
#data <- as_tibble(data[c(-10,-11,-14,-15,-16,-19,-20,-21,-22)])
data$padj<-as.numeric(as.matrix(data$padj))
  #Take logarithm of Q value;
data$log10FDR <- -log10(data$padj)
   #Generating a grouping label of significant up and down data;
data$group <- case_when(data$log2FoldChange > 1 & data$padj < 0.05 ~ "Up",
                      data$log2FoldChange < -1 & data$padj < 0.05 ~ "Down",
                      abs(data$log2FoldChange) <= 1 ~ "None",
                      data$padj >= 0.05 ~ "None") 
#The 10 genes with the most significant expression differences were obtained;
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) %>% top_n(10,abs(log2FoldChange))
#The gene table of differentially expressed TOP10 was divided into up and down parts;
up <- filter(top10sig,group=="Up")
down <- filter(top10sig,group=="Down")
#Add a new column, mark the differential gene of TOP10 as 2 and the others as 1;
# data$gene_id<-as.numeric(as.matrix(data$gene_id))
data$size <- case_when(!(data$gene_id %in% top10sig$gene_id)~ 1,data$gene_id %in% top10sig$gene_id ~ 2)
#Extract the gene table of non TOP10;
df <- filter(data,size==1)
#Specify the drawing order and convert the group column to factor type;
df$group <- factor(df$group,
                        levels = c("Up","Down","None"),
                        ordered = T)
#Start drawing and establish mapping;
p0 <-ggplot(data=df,aes(log2FoldChange,log10FDR,color=group))
#Add scatter points;
p1 <- p0+geom_point(size=3)
p1
#Custom translucent color (red and green);
mycolor <- c("#FF99CC","#99CC00","gray80")
p22 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.7))
p22
2021-11-09 volcano map based on RNA SEQ table (second time)

No TOP10 gene
#Continue to add the points corresponding to TOP10 gene;
p2 <- p22+geom_point(data=up,aes(log2FoldChange,log10FDR),
                    color="#FF9999",size=4,alpha=0.9)+
  geom_point(data=down,aes(log2FoldChange,log10FDR),
             color="#7cae00",size=4,alpha=0.9)
p2
2021-11-09 volcano map based on RNA SEQ table (second time)

Top 10 added
#Sets the size of the empty area at both ends of the axis range
p3<-p2+labs(y="-log10FDR")+
+     scale_y_continuous(limits = c(0, 15))+scale_x_continuous(limits = c(-5,25))
p3
2021-11-09 volcano map based on RNA SEQ table (second time)

Adjust the coordinate axis and delete the blank area

Next, add a leader to the scatter corresponding to the top10 gene. We mainly use geom_ text_ Repl () and geom_ label_ For the two functions of repl(), the following are some important parameters:
nudge_ X / Y: the distance between the data point and the corresponding data label. For example, 1 indicates that the label is 1 unit on the right / top of the point, while – 2.2 indicates that the label is 2.2 units on the left / bottom of the point;
Direction: label distribution direction, X indicates horizontal distribution, y indicates vertical distribution and both indicates random distribution;
segment. Size: Specifies the thickness of the line segment;
point. Padding: refers to the free area around the point. It determines the distance from the end point of the connecting line to the center of the data point. The unit is line.

p3<-p2+labs(y="-log10FDR")+scale_y_continuous(limits = c(0, 15))+scale_x_continuous(limits = c(-25,25))
set.seed(007)
p4 <- p3+geom_text_repel(data=top10sig,aes(log2FoldChange,log10FDR,label=Symbol),
                         force=120,color="grey20",size=5,
                         point.padding = 0.8,hjust = 0.5,
                         arrow = arrow(length = unit(0.01, "npc"),
                                       type = "open", ends = "last"),
                         segment.color="grey20",
                         segment.size=0.5,
                         segment.alpha=0.8,
                         nudge_x=0,
                         nudge_y=0.5)
2021-11-09 volcano map based on RNA SEQ table (second time)

p5 <- p4+geom_hline(yintercept = c(-log10(0.05)),
                    size = 1.4,
                    color = "orange",
                    lty = "dashed")+
    geom_vline(xintercept = c(-1,1),
               size = 1.4,
               color = "orange",
               lty = "dashed")
p5
2021-11-09 volcano map based on RNA SEQ table (second time)

With auxiliary line
p6 <- p5+geom_label_repel(
data = up,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = 2,
nudge_y = 1,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#96C93D",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5) +
geom_label_repel(
data = down,aes(log2FoldChange,log10FDR,label=Symbol),
nudge_x = -10,
nudge_y = 2,
color = "white",
alpha = 0.9,
point.padding = 0.8,
size = 5,
fill = "#9881F5",
segment.size = 0.8,
segment.color = "grey50",
direction = "y",
hjust = 0.5)
> p6
2021-11-09 volcano map based on RNA SEQ table (second time)

Nudge as appropriate_ X = – 10:

2021-11-09 volcano map based on RNA SEQ table (second time)

I don’t know which one is suitable:) I think it’s better not to pull a long line. Anyway, the purpose is to highlight which genes are down regulated

  • The coordinate axis can be truncated to combine the outlier values on one map, but it seems more convenient and beautiful to directly take a screenshot and set the coordinate axis of the remaining cluster values
    Generate outliers:
p6 <- p5+geom_label_repel(
    data = up,aes(log2FoldChange,log10FDR,label=Symbol),
    nudge_x = 2,
    nudge_y = 1,
    color = "white",
    alpha = 0.9,
    point.padding = 0.8,
    size = 5,
    fill = "#96C93D",
    segment.size = 0.8,
    segment.color = "grey50",
    direction = "y",
    hjust = 0.5) +
    geom_label_repel(
        data = down,aes(log2FoldChange,log10FDR,label=Symbol),
        nudge_x = -1,
        nudge_y = 0.5,
        color = "white",
        alpha = 0.9,
        point.padding = 0.8,
        size = 5,
        fill = "#9881F5",
        segment.size = 0.8,
        segment.color = "grey50",
        direction = "y",
        hjust = 0.5)

Generate aggregation section:

p3<-p2+labs(y="-log10FDR")+ scale_y_continuous(limits = c(0, 5))+scale_x_continuous(limits = c(-5,5))
p4 <- p3+geom_hline(yintercept = c(-log10(0.05)),
                    size = 1.4,
                    color = "orange",
                    lty = "dashed")+
    geom_vline(xintercept = c(-1,1),
               size = 1.4,
               color = "orange",
               lty = "dashed")
p5 <- p4+geom_label_repel(
+     data = up,aes(log2FoldChange,log10FDR,label=Symbol),
+     nudge_x = 0.5,
+     nudge_y = 0.5,
+     color = "white",
+     alpha = 0.9,
+     point.padding = 0.8,
+     size = 5,
+     fill = "#96C93D",
+     segment.size = 0.8,
+     segment.color = "grey50",
+     direction = "y",
+     hjust = 0.5) +
+     geom_label_repel(
+         data = down,aes(log2FoldChange,log10FDR,label=Symbol),
+         nudge_x = -0.5,
+         nudge_y = 0.5,
+         color = "white",
+         alpha = 0.9,
+         point.padding = 0.8,
+         size = 5,
+         fill = "#9881F5",
+         segment.size = 0.8,
+         segment.color = "grey50",
+         direction = "y",
+         hjust = 0.5)
2021-11-09 volcano map based on RNA SEQ table (second time)

image.png
With another tutorial, start with the classification of differential genes
#Add a new column of label
data$Label = ""
#The log2fc values of differentially expressed genes were sorted from large to small, and the top 20 was selected
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
log2FoldChange.genes <- head(data$Symbol, 20)

#Log10 conversion of differential FDR
data$logQ <- -log10(data$padj)

#The p value of differentially expressed genes was analyzed from small to large row top 20
#data <- data[order(data$logQ)]

#Among the highly expressed genes, 20 with the lowest FDR were selected
data <- data[order(abs(data$padj),decreasing = T), ]
fdr.genes <- head(data$Symbol, 20)

#Set log2fc Genes and FDR Merge genes and add them to label
top20.genes <- c(as.character(log2FoldChange.genes), as.character(fdr.genes))
data$Label[match(top20.genes, data$Symbol)] <- top20.genes

p1 <- ggscatter(data, x = "log2FoldChange", y = "padj",
          color = "type", 
          palette = c("#00BA38", "#BBBBBB", "#F8766D"),
          size = 2,
          label = data$Label, 
          font.label = 8, 
          repel = T,
          xlab = "log2FoldChange", 
          ylab = "-log10(padj)") + 
  theme_base() + 
  geom_hline(yintercept = 1.30, linetype="dashed") +
  geom_vline(xintercept = c(-2,2), linetype="dashed")
2021-11-09 volcano map based on RNA SEQ table (second time)

mycolor <- c("#FF9999","#99CC00","gray80")
p21 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.9))
p21
2021-11-09 volcano map based on RNA SEQ table (second time)

#Other color schemes:
mycolor <- c("#FF99CC","#99CC00","gray80")
p22 <- p1 + scale_colour_manual(name="",values=alpha(mycolor,0.7))
p22
2021-11-09 volcano map based on RNA SEQ table (second time)

#Continue to add the points corresponding to TOP10 gene;
#p2 <- p22+geom_point(data=up,aes(log2FoldChange,log10FDR),
                    color="#FF9999",size=3,alpha=0.9)+
  geom_point(data=down,aes(log2FoldChange,log10FDR),
             color="#7cae00",size=3,alpha=0.9)
#p2

#The expansion function sets the size of the blank area at both ends of the coordinate axis range; Mult is the "multiple" mode and add is the "additive" mode;
p3<-p2+labs(y="-log10FDR")+
  scale_y_continuous(expand=expansion(add = c(2, 0)),
                     limits = c(0, 40),
                     breaks = c(0,10,20,30,40),
                     label = c("0","10","20","30","40"))+
  scale_x_continuous(limits = c(-4, 4),
                     breaks = c(-4,-2,0,2,4),
                     label = c("-4","-2","0","2","4"))
p3
2021-11-09 volcano map based on RNA SEQ table (second time)

image.png
set.seed(007)
p4 <-p22+geom_text_repel(data=top10sig,aes(log2FoldChange,log10FDR,label=Symbol),
force=80,color="grey20",size=3,
point.padding = 0.5,hjust = 0.5,
arrow = arrow(length = unit(0.01, "npc"),
type = "open", ends = "last"),
segment.color="grey20",
segment.size=0.2,
segment.alpha=0.8,
nudge_x=0,
nudge_y=1)
p4

Third:

#The log2fc values of differentially expressed genes were sorted from large to small, and the top 10 was taken
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
top10sig <- head(data$Symbol, 10)

#The gene table of differentially expressed TOP10 was divided into up and down parts;
data$group<-as.numeric(as.matrix(data$group))
up <- filter(top10sig,group=="Up")
down <- filter(top10sig,group=="Down")



  #The 10 genes with the most significant expression differences were obtained;
top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) %>% top_n(10,abs(data$log2FoldChange))

top10sig <- filter(data,group!="None") %>% distinct(Symbol,.keep_all = T) 

top_n <- top_n(10,data$log2FoldChange)
top10sig <- top_n(10,abs(data$log2FoldChange))

Error in UseMethod("tbl_vars") : 
  no applicable method for 'tbl_vars' applied to an object of class "c('double', 'numeric')"

restart:

library(ggpubr)
library(ggthemes)
setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop")
data <- read.table("RNA-seq - 1.txt",header = T,row.names = 1,sep="\t",check.names = F)
data$logQ <- -log10(data$padj)
ggscatter(data, x = "log2FoldChange", y = "logQ") + theme_base()
data$Group = "normal"
#Add adj P. Val less than 0.05, logfc greater than 2
#Add adj P. Genes with Val less than 0.05 and logfc less than 2 are set as significantly down regulated genes
data$Group[which( (data$padj < 0.01) & (data$log2FoldChange > 1) )] = "up"
data$Group[which( (data$padj < 0.01) & (data$log2FoldChange < -1) )] = "down"
#Draw a new volcanic map
ggscatter(data, x = "log2FoldChange", y = "logQ",
+           color = "Group") + theme_base()
2021-11-09 volcano map based on RNA SEQ table (second time)

#Add a logP boundary line (geom_hline) and a logfc boundary line (geom_vline) to the volcano map
geom_hline(yintercept = 1.30, linetype="dashed") +
geom_vline(xintercept = c(-2,2), linetype="dashed")
2021-11-09 volcano map based on RNA SEQ table (second time)

#Add a new column of label
data$Label = ""
#The log2fc values of differentially expressed genes were sorted from large to small, and the top 20 was taken
data <- data[order(abs(data$log2FoldChange),decreasing = T), ]
log2FoldChange.genes <- head(data$ID, 20)
#The p value of differentially expressed genes was analyzed from small to large row top 20
# data <- data[order(data$logQ)]
#Among the highly expressed genes, 20 with the lowest FDR were selected
data <- data[order(abs(data$logQ),decreasing = T), ]
fdr.genes <- head(data$ID, 20)
2021-11-09 volcano map based on RNA SEQ table (second time)