• R获取指定GO term和KEGG pathway的gene list基因集


    clusterProfiler没有显性的接口,但是可以直接扣取clusterProfiler里的函数。

    核心函数就是get_GO_data

    GO_DATA <- get_GO_data("org.Hs.eg.db", "BP", "SYMBOL")   

    可以看到输入的是GO数据库,选定类别,基因名字类型,输出的就是整个数据库。

    但是想调用这个函数没那么简单,得导入一系列的基础函数。

    一个常见的任务就是获取GO数据库里所有的cell cycle相关的基因,这需要从我们的基因集里移除。

    有了这个函数,我们就可以这么做了,两句R代码搞定。

    cellCycleGO <- names(GO_DATA$PATHID2NAME[grep("cell cycle|DNA replication|cell division|segregation", GO_DATA$PATHID2NAME)])
    
    cellCycleGene <- unique(unlist(GO_DATA$PATHID2EXTID[cellCycleGO]))
    
    print(length(cellCycleGene))
    

      

    library(DOSE)
    library(GOSemSim)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    #
    get_GO_data <- function(OrgDb, ont, keytype) {
        GO_Env <- get_GO_Env()
        use_cached <- FALSE
    
        if (exists("organism", envir=GO_Env, inherits=FALSE) &&
            exists("keytype", envir=GO_Env, inherits=FALSE)) {
    
            org <- get("organism", envir=GO_Env)
            kt <- get("keytype", envir=GO_Env)
    
            if (org == DOSE:::get_organism(OrgDb) &&
                keytype == kt &&
                exists("goAnno", envir=GO_Env, inherits=FALSE)) {
                ## https://github.com/GuangchuangYu/clusterProfiler/issues/182
                ## && exists("GO2TERM", envir=GO_Env, inherits=FALSE)){
    
                use_cached <- TRUE
            }
        }
    
        if (use_cached) {
            goAnno <- get("goAnno", envir=GO_Env)
        } else {
            OrgDb <- GOSemSim:::load_OrgDb(OrgDb)
            kt <- keytypes(OrgDb)
            if (! keytype %in% kt) {
                stop("keytype is not supported...")
            }
    
            kk <- keys(OrgDb, keytype=keytype)
            goAnno <- suppressMessages(
                select(OrgDb, keys=kk, keytype=keytype,
                       columns=c("GOALL", "ONTOLOGYALL")))
    
            goAnno <- unique(goAnno[!is.na(goAnno$GOALL), ])
    
            assign("goAnno", goAnno, envir=GO_Env)
            assign("keytype", keytype, envir=GO_Env)
            assign("organism", DOSE:::get_organism(OrgDb), envir=GO_Env)
        }
    
        if (ont == "ALL") {
            GO2GENE <- unique(goAnno[, c(2,1)])
        } else {
            GO2GENE <- unique(goAnno[goAnno$ONTOLOGYALL == ont, c(2,1)])
        }
    
        GO_DATA <- DOSE:::build_Anno(GO2GENE, get_GO2TERM_table())
    
        goOnt.df <- goAnno[, c("GOALL", "ONTOLOGYALL")] %>% unique
        goOnt <- goOnt.df[,2]
        names(goOnt) <- goOnt.df[,1]
        assign("GO2ONT", goOnt, envir=GO_DATA)
        return(GO_DATA)
    }
    
    get_GO_Env <- function () {
        if (!exists(".GO_clusterProfiler_Env", envir = .GlobalEnv)) {
            pos <- 1
            envir <- as.environment(pos)
            assign(".GO_clusterProfiler_Env", new.env(), envir=envir)
        }
        get(".GO_clusterProfiler_Env", envir = .GlobalEnv)
    }
    
    get_GO2TERM_table <- function() {
        GOTERM.df <- get_GOTERM()
        GOTERM.df[, c("go_id", "Term")] %>% unique
    }
    
    get_GOTERM <- function() {
        pos <- 1
        envir <- as.environment(pos)
        if (!exists(".GOTERM_Env", envir=envir)) {
            assign(".GOTERM_Env", new.env(), envir)
        }
        GOTERM_Env <- get(".GOTERM_Env", envir = envir)
        if (exists("GOTERM.df", envir = GOTERM_Env)) {
            GOTERM.df <- get("GOTERM.df", envir=GOTERM_Env)
        } else {
            GOTERM.df <- toTable(GOTERM)
            assign("GOTERM.df", GOTERM.df, envir = GOTERM_Env)
        }
        return(GOTERM.df)
    }
    

      

    获取KEGG的通路和基因是一样的,也是用clusterProfiler

    代码:

    hsa_kegg <- clusterProfiler::download_KEGG("hsa")
    
    names(hsa_kegg)
    
    head(hsa_kegg$KEGGPATHID2NAME)
    
    head(hsa_kegg$KEGGPATHID2EXTID)
    
    PATH2ID <- hsa_kegg$KEGGPATHID2EXTID
    PATH2NAME <- hsa_kegg$KEGGPATHID2NAME
    PATH_ID_NAME <- merge(PATH2ID, PATH2NAME, by="from")
    colnames(PATH_ID_NAME) <- c("KEGGID", "ENTREZID", "DESCRPTION")
    
    # write.table(PATH_ID_NAME, "HSA_KEGG.txt", sep="	")
    
    library(biomaRt)
    
    mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
    entrezgene <- PATH_ID_NAME$ENTREZID
    # This step need some time
    ensembl_gene_id<- getBM(attributes=c("ensembl_gene_id", "entrezgene"),
                      filters = "entrezgene",
                           values=entrezgene , mart= mart)
    
    PATH_ID_NAME <- merge(PATH_ID_NAME, ensembl_gene_id, by.x= "ENTREZID",by.y= "entrezgene")
    

      

  • 相关阅读:
    Django models 中导入重复数据的解决办法
    Python3 re
    Django 从请求到相应的过程
    Django session global配置
    CountDownLatch和CyclicBarrier用法及区别讲解
    npm
    go 实现爬虫
    go,redis实现订阅和发布
    使用Network Emulator Toolkit工具模拟网络丢包测试
    vue+element 使用多个el-select绑定的值为同一个值
  • 原文地址:https://www.cnblogs.com/leezx/p/10819110.html
Copyright © 2020-2023  润新知