0. Install biomaRt and set the random seed (so results can be reproduced)

# One-time installation of biomaRt (uncomment to run):
#source("https://bioconductor.org/biocLite.R")
#biocLite("biomaRt")

# Work from the supplemental-data directory when it exists. The path is specific
# to the original author's machine, so on any other machine we keep the current
# working directory instead of aborting the whole script.
supp_data_dir <- "/home/beissinger/Documents/MESH_Maize/Manuscript/Supplemental Data/"
if (dir.exists(supp_data_dir)) {
  setwd(supp_data_dir)
} else {
  message("Directory not found, keeping working directory: ", getwd())
}

# Fix the RNG seed so the random draws in this script are reproducible. The
# value itself is arbitrary; change it to obtain a different random sample.
set.seed(651465)

1. Create a vector of background genes

We first create a vector of background genes. We will use every gene with the required data (entrez id) as the background.

library(biomaRt)
## Connect to the Ensembl Plants BioMart and select the "zmays_eg_gene" dataset.
mart <- useMart(biomart = "plants_mart", host = "plants.ensembl.org",
                dataset = "zmays_eg_gene")
## Fetch every gene together with its Entrez Gene ID.
## Trailing numbers on the lines below are the row counts noted by the original
## author; they will differ if the remote database has changed.
## NOTE(review): newer Ensembl releases may expose this attribute under a
## different name (e.g. "entrezgene_id") -- confirm with listAttributes(mart).
univ.geneID <- getBM(attributes = c("ensembl_gene_id", "entrezgene"),
                     mart = mart) # 40481
## remove genes with no corresponding Entrez Gene ID
univ.geneID2 <- univ.geneID[!is.na(univ.geneID[, 2]), ] # 14142
## remove duplicated Entrez Gene ID (the first occurrence is kept)
univ.geneID3 <- univ.geneID2[!duplicated(univ.geneID2[, 2]), ] # 13630
## Get GO terms (accession, name, namespace, evidence code) for those genes
univ.geneID4 <- getBM(
  attributes = c("entrezgene", "go_accession", "go_name_1006",
                 "go_namespace_1003", "go_linkage_type"),
  mart = mart,
  filters = "entrezgene",
  values = univ.geneID3$entrezgene
)
## Drop rows whose GO accession (column 2) is the empty string, i.e. genes
## without a GO term. A logical mask is used instead of `-which(...)`: when no
## row matches, `-which(...)` is a zero-length index and would silently drop
## EVERY row. `%in%` never yields NA, so rows with an NA accession are kept,
## exactly as they were with `which()`.
univ.geneID5 <- univ.geneID4[!(univ.geneID4[, 2] %in% ""), ]
### Make the three-column data frame (GO id, evidence code, gene id) that is
### later passed to GOFrame()
goframeData <- data.frame(
  go_id = univ.geneID5$go_accession,
  Evidence = univ.geneID5$go_linkage_type,
  gene_id = univ.geneID5$entrezgene,
  stringsAsFactors = FALSE
)

2. Create a vector of selected genes

Secondly, we create a vector of significant genes by randomly choosing them! Since the other datasets were based on AGPv2, we again use an archived gene build of AGPv2 to download appropriate data.

## Read the gene annotation table (first line holds the column names).
## TRUE/FALSE are spelled out because T and F are ordinary, reassignable
## variables.
allGenes <- read.table(
  "ftp://ftp.gramene.org/pub/gramene/maizesequence.org/release-5b/filtered-set/ZmB73_5b_FGS_info.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
## Keep characters 4-6 of the chromosome label and convert them to a number.
## Labels that are not numeric become NA (R emits a coercion warning) and the
## corresponding rows are dropped.
## NOTE(review): presumably labels look like "chr1" -- confirm against the file.
allGenes$chromosome <- substr(allGenes$chromosome, 4, 6)
allGenes$chromosome <- as.numeric(allGenes$chromosome)
allGenes <- allGenes[!is.na(allGenes$chromosome), ]
## One row per gene: only work with the first transcript listed for each gene
allGenes <- allGenes[!duplicated(allGenes$gene_id), ]


## Randomly sample 1500 genes (rows) without replacement
my.geneID <- allGenes[sample(nrow(allGenes), 1500), ]

## Rename the second column so it can serve as the merge key.
## NOTE(review): this assumes column 2 is the gene identifier -- verify against
## the file header.
colnames(my.geneID)[2] <- "ensembl_gene_id"
## Merge with the background table to attach Entrez Gene IDs; sampled genes
## that are absent from the background are dropped by the inner join.
my.geneID2 <- merge(my.geneID, univ.geneID3, by = "ensembl_gene_id")
## remove duplicated Entrez Gene ID
my.geneID3 <- my.geneID2[!duplicated(my.geneID2$entrezgene), ]

3. GO enrichment analysis

We perform a GO analysis using the GOstats package. This time we do refer to the GO results in the manuscript, so the code below was run and its results are part of this R Markdown file.

## One-time installation of the Bioconductor packages used below
## (uncomment to run):
#source("https://bioconductor.org/biocLite.R")
#biocLite("GOstats")
#biocLite("GOSemSim")
#biocLite("AnnotationForge")
#library("AnnotationForge")
#available.dbschemas() #maize is not available :-(
library("GOstats")
library("GOSemSim")

## Build the GO-to-gene mappings from the custom annotation data frame
goFrame <- GOFrame(goframeData, organism = "Zea mays")
goAllFrame <- GOAllFrame(goFrame)

## Wrap the mappings in a gene set collection keyed by GO term
library(GSEABase)
gsc <- GeneSetCollection(goAllFrame, setType = GOCollection())

## Parameters for a conditional over-representation test on the
## Biological Process (BP) ontology with a p-value cutoff of 0.05:
## the sampled genes are tested against the GO-annotated background.
params <- GSEAGOHyperGParams(
  name = "Domestication Zea mays GO",
  geneSetCollection = gsc,
  geneIds = my.geneID3$entrezgene,
  universeGeneIds = univ.geneID5$entrezgene,
  ontology = "BP",
  pvalueCutoff = 0.05,
  conditional = TRUE,
  testDirection = "over"
)

GO enrichment analysis for BP

## Run the hypergeometric test described by `params`
BP <- hyperGTest(params)
## Print the GO ID, p-value and term description of each reported term.
## (The original note here read "37"; the output pasted below lists 34 rows.)
summary(BP)[, c("GOBPID", "Pvalue", "Term")]
##        GOBPID      Pvalue
## 1  GO:0002683 0.001440028
## 2  GO:0046470 0.005345651
## 3  GO:0006432 0.008211351
## 4  GO:0003333 0.011707286
## 5  GO:0016226 0.016963223
## 6  GO:0006820 0.017096016
## 7  GO:0016246 0.020381452
## 8  GO:0071705 0.024739080
## 9  GO:0031167 0.026643190
## 10 GO:0000724 0.029044180
## 11 GO:0009267 0.029424742
## 12 GO:0016441 0.031819309
## 13 GO:0009658 0.031848398
## 14 GO:0016559 0.034644233
## 15 GO:0006817 0.034644233
## 16 GO:0006835 0.034644233
## 17 GO:0006399 0.037857230
## 18 GO:0010342 0.038011696
## 19 GO:0008616 0.038011696
## 20 GO:0019433 0.038011696
## 21 GO:0009440 0.038011696
## 22 GO:0010529 0.038011696
## 23 GO:0046461 0.038011696
## 24 GO:0046473 0.038011696
## 25 GO:0045824 0.038011696
## 26 GO:0046503 0.038011696
## 27 GO:0098755 0.038011696
## 28 GO:0050687 0.038011696
## 29 GO:0010028 0.038011696
## 30 GO:0007143 0.038011696
## 31 GO:0032504 0.038664724
## 32 GO:0051567 0.038699873
## 33 GO:0010167 0.043442764
## 34 GO:0046942 0.045526522
##                                                       Term
## 1             negative regulation of immune system process
## 2                    phosphatidylcholine metabolic process
## 3                         phenylalanyl-tRNA aminoacylation
## 4                       amino acid transmembrane transport
## 5                             iron-sulfur cluster assembly
## 6                                          anion transport
## 7                                         RNA interference
## 8                              nitrogen compound transport
## 9                                         rRNA methylation
## 10 double-strand break repair via homologous recombination
## 11                         cellular response to starvation
## 12                      posttranscriptional gene silencing
## 13                                chloroplast organization
## 14                                      peroxisome fission
## 15                                 phosphate ion transport
## 16                             dicarboxylic acid transport
## 17                                  tRNA metabolic process
## 18                               endosperm cellularization
## 19                          queuosine biosynthetic process
## 20                          triglyceride catabolic process
## 21                               cyanate catabolic process
## 22                    negative regulation of transposition
## 23                         neutral lipid catabolic process
## 24                     phosphatidic acid metabolic process
## 25           negative regulation of innate immune response
## 26                          glycerolipid catabolic process
## 27            maintenance of seed dormancy by absisic acid
## 28        negative regulation of defense response to virus
## 29                                       xanthophyll cycle
## 30                                 female meiotic division
## 31                     multicellular organism reproduction
## 32                               histone H3-K9 methylation
## 33                                     response to nitrate
## 34                               carboxylic acid transport
# GO similarity: pairwise semantic similarity among the reported BP terms
library(corrplot)
# IDs of the terms reported by the test
goListBP <- summary(BP)[["GOBPID"]]
# combine = NULL keeps the full term-by-term matrix instead of one score
goSimMatBP <- mgoSim(
  goListBP, goListBP,
  ont = "BP", measure = "Wang", combine = NULL
)
# Draw the lower triangle; is.corr = FALSE because these are similarity
# scores, not correlations
corrplot(
  goSimMatBP,
  is.corr = FALSE, type = "lower",
  tl.col = "black", tl.cex = 0.8
)