用DAVID搞定了所有的GO term之后,接下来就是利用GO slim来处理这些term。
用的包是goatools,需要下载几个obo文件,以及fisher、pygraphviz和graphviz等几个模块。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Add GO slim annotations to the GO terms stored in a JSON record file.

Created on Fri Nov 21 20:06:42 2014

@author: hluo
"""
import json
import os
import os.path as op
import re
import sys

# Make the modules that live one directory up importable
# (boilerplate copied from find_enrichment.py).
sys.path.insert(0, op.join(op.dirname(__file__), ".."))
from obo_parser import GODag
from mapslim import mapslim
# When goatools is installed as a package, use instead:
# from goatools.mapslim import mapslim

# Default locations of the full GO ontology and of the generic GO slim.
DEFAULT_OBO_FILE = '/home/hluo/Desktop/goslim/go-basic.obo'
DEFAULT_SLIM_OBO_FILE = '/home/hluo/Desktop/goslim/goslim_generic.obo'

# NOTE(review): the pattern in the published script had lost its backslashes
# ("(?<=$)S+(?=~)") and could never match.  Extracting the "GO:<digits>"
# identifier directly is presumably what was intended -- confirm against the
# actual annotation strings in the JSON files.
GO_ID_RE = re.compile(r'GO:\d+')


def get_goslim(term, godag, goslimdag):
    """Return the slim terms that *term* maps onto.

    Parameters:
        term: GO identifier to look up.
        godag: DAG of the full ontology; must support ``term in godag``.
        goslimdag: DAG of the slim ontology, passed through to ``mapslim``.

    Returns:
        ``[direct_anc, all_anc]`` as produced by ``mapslim``, or
        ``[None, None]`` when *term* is not present in *godag*.
    """
    if term not in godag:
        return [None, None]
    direct_anc, all_anc = mapslim(term, godag, goslimdag)
    return [direct_anc, all_anc]


def mygofun(json_file, obo_file=DEFAULT_OBO_FILE,
            slim_obo_file=DEFAULT_SLIM_OBO_FILE):
    """Annotate every record of *json_file* with its GO slim terms.

    The JSON file is expected to hold a list of dicts.  For each key that
    starts with ``'GO'`` the GO identifiers are extracted from the strings
    stored under that key, mapped onto the slim ontology, and two new keys
    are added to the record:

    * ``<key>_dslim`` -- union of the direct slim terms,
    * ``<key>_aslim`` -- union of all slim terms.

    The augmented records are written to ``<json_file>.txt``.

    Parameters:
        json_file: path of the input JSON file.
        obo_file: path of the full GO ontology (``.obo``).
        slim_obo_file: path of the GO slim ontology (``.obo``).

    Raises:
        AssertionError: if one of the ontology files does not exist.
    """
    assert os.path.exists(obo_file), "file %s not found!" % obo_file
    assert os.path.exists(slim_obo_file), "file %s not found!" % slim_obo_file

    # Load both DAGs once; they are reused for every record.
    go_dag = GODag(obo_file)
    goslim_dag = GODag(slim_obo_file)

    with open(json_file) as handle:
        records = json.load(handle)

    for item in records:
        # Snapshot the keys first: new keys are added to ``item`` below, and
        # iterating a live key view while inserting fails on Python 3.
        go_keys = [key for key in item if key.startswith('GO')]
        for k in go_keys:
            direct_slim = set()
            all_slim = set()
            for text in item[k]:
                match = GO_ID_RE.search(text)
                if match is None:
                    # No GO identifier in this annotation string.
                    continue
                direct_anc, all_anc = get_goslim(
                    match.group(0), go_dag, goslim_dag)
                if direct_anc is None:
                    # Term unknown to the full ontology: nothing to add.
                    continue
                direct_slim.update(direct_anc)
                all_slim.update(all_anc)
            # Sorted so that the output file is reproducible between runs.
            item[k + '_dslim'] = sorted(direct_slim)
            item[k + '_aslim'] = sorted(all_slim)

    with open('%s.txt' % json_file, 'w') as handle:
        json.dump(records, handle, indent=1)


if __name__ == '__main__':
    mygofun('NC_000913.gbk.json')
The script loads the JSON file, adds the new keys to every record in it, and writes the result to `<json_file>.txt`.
P.S. `dslim` stands for "direct slim" and `aslim` for "all slim".
Then I ran a Python batch script to process all the JSON files.
# -*- coding: utf-8 -*-
"""Run mygofun on every JSON file found in one directory.

Created on Mon Nov 24 17:37:24 2014

@author: hluo
"""
import os
import re
import sys

from mygoslim import mygofun

if __name__ == '__main__':
    mydir = '/home/hluo/Desktop/gbk'

    # The dot is escaped so that only a literal ".json" suffix matches
    # (an unescaped "." would also accept names such as "fooxjson").
    re_obj = re.compile(r'\.json$')

    # Keep only the directory entries whose name ends in ".json".
    json_file_list = [name for name in os.listdir(mydir)
                      if re_obj.search(name)]

    for item in json_file_list:
        mygofun(os.path.join(mydir, item))
The script uses the `re` and `os` modules to find all the JSON files in the directory.