2016年
#coding=utf-8
"""Flatten the principal-contributor lines of ``nosta_2016`` into ``2016_list``.

Every source document is read for its ``project_name`` and
``project_content``.  When ``project_content`` has a principal-contributors
entry, that entry is iterated as a list of text lines (one per person) made of
space-separated, labelled fields, and one flat document per line is written to
the target collection.

The code uses only syntax shared by Python 2.7 and Python 3.3+.
"""
import re

# Key inside ``project_content`` that holds the per-person lines.
PEOPLE_KEY = u'主要完成人'

# Output key -> pattern whose group 1 is the field *including* its label.
# A lazily matched value stops at the first following space or, when no space
# follows, at the end of the line, so the last field of a line is not lost.
# The award field always runs to the end of the line.
# Compiled once here instead of on every person line.
FIELD_PATTERNS = (
    ('name', re.compile(u'(姓名:.*?)(?: |$)')),
    ('duty', re.compile(u'.* (行政职务:.*?)(?: |$)')),
    ('title', re.compile(u'.* (技术职称:.*?)(?: |$)')),
    ('unit', re.compile(u'.* (工作单位:.*?)(?: |$)')),
    ('contribution', re.compile(u'.* (对本项目技术创造性贡献:.*?)(?: |$)')),
    ('award', re.compile(u'.* (曾获国家科技奖励情况:.*)')),
)

# Alternative contribution label, consulted only when the one above is absent.
ACADEMIC_CONTRIBUTION_RE = re.compile(u'.* (对本项目主要学术贡献:.*?)(?: |$)')


def parse_person_line(line):
    """Extract the labelled fields of one contributor line.

    Args:
        line: Text of one person, fields separated by single spaces.

    Returns:
        A dict with the keys ``name``, ``duty``, ``title``, ``unit``,
        ``contribution`` and ``award``.  Each value is the matched text with
        its label still attached, or ``''`` when the label is not found.
    """
    record = {}
    for key, pattern in FIELD_PATTERNS:
        found = pattern.search(line)
        record[key] = found.group(1) if found else ''

    if not record['contribution']:
        found = ACADEMIC_CONTRIBUTION_RE.search(line)
        record['contribution'] = found.group(1) if found else ''
    return record


def main(source_name="nosta_2016", target_name="2016_list"):
    """Copy every contributor of ``source_name`` into ``target_name``.

    Args:
        source_name: Collection of the ``nosta`` database to read from.
        target_name: Collection of the ``nosta`` database to insert into.
    """
    # Imported here so the parsing helper stays importable without the driver.
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    db = client["nosta"]
    # NOTE(review): credentials are hard-coded, and Database.authenticate()
    # was removed in PyMongo 4 -- pass username/password to MongoClient when
    # the driver is upgraded.
    db.authenticate("zty", "zty")
    source = db[source_name]
    target = db[target_name]

    projection = {"project_name": 1, "project_content": 1, "_id": 0}
    for count, item in enumerate(source.find({}, projection), 1):
        print(count)  # running count of processed projects
        content = item['project_content']
        if PEOPLE_KEY not in content:
            continue
        # An empty or missing list simply yields no records.
        for line in content[PEOPLE_KEY] or ():
            record = {'project_name': item['project_name']}
            record.update(parse_person_line(line))
            target.insert_one(record)


if __name__ == "__main__":
    main()
2017、2018年
#coding=utf-8
"""Flatten the principal-contributor text of ``nosta_2017`` into ``2017_list``.

Every source document is read for its ``project_name`` and
``project_content``.  When ``project_content`` has a principal-contributors
entry, that entry is treated as one run-together string: it is cut at every
name label, a space is inserted before each known field label, and the
resulting per-person lines are parsed into flat documents that are written to
the target collection.

The collection names are parameters of :func:`main`, so the same code can be
pointed at another year's collections.  The code uses only syntax shared by
Python 2.7 and Python 3.3+.
"""
import re

# Key inside ``project_content`` that holds the contributors text.
PEOPLE_KEY = u'主要完成人:'

# Label that starts every person's record in the run-together text.
NAME_LABEL = u'姓名:'

# Labels that get a space inserted in front of them so the field patterns
# below (which expect space-separated fields) can find them.
PADDED_LABELS = (
    u'排名:',
    u'行政职务:',
    u'技术职称:',
    u'工作单位:',
    u'完成项目时所在单位:',
    u'对本项目技术创造性贡献:',
    u'对本项目主要学术贡献:',
    u'曾获国家科技奖励情况:',
)

# Output key -> pattern whose group 1 is the field *including* its label.
# A lazily matched value stops at the first following space or, when no space
# follows, at the end of the line, so the last field of a line is not lost.
# The award field always runs to the end of the line.
# Compiled once here instead of on every person line.
FIELD_PATTERNS = (
    ('name', re.compile(u'(姓名:.*?)(?: |$)')),
    ('duty', re.compile(u'.* (行政职务:.*?)(?: |$)')),
    ('title', re.compile(u'.* (技术职称:.*?)(?: |$)')),
    ('unit', re.compile(u'.* (工作单位:.*?)(?: |$)')),
    ('contribution', re.compile(u'.* (对本项目技术创造性贡献:.*?)(?: |$)')),
    ('award', re.compile(u'.* (曾获国家科技奖励情况:.*)')),
)

# Alternative contribution label, consulted only when the one above is absent.
ACADEMIC_CONTRIBUTION_RE = re.compile(u'.* (对本项目主要学术贡献:.*?)(?: |$)')


def split_person_lines(text):
    """Cut the contributors text into one space-separated line per person.

    Anything before the first name label is discarded.

    Args:
        text: The run-together contributors string.

    Returns:
        A list of lines, each starting with the name label.
    """
    lines = []
    for chunk in text.split(NAME_LABEL)[1:]:
        for label in PADDED_LABELS:
            chunk = chunk.replace(label, u' ' + label)
        lines.append(NAME_LABEL + chunk)
    return lines


def parse_person_line(line):
    """Extract the labelled fields of one contributor line.

    Args:
        line: Text of one person, fields separated by spaces.

    Returns:
        A dict with the keys ``name``, ``duty``, ``title``, ``unit``,
        ``contribution`` and ``award``.  Each value is the matched text with
        its label still attached, or ``''`` when the label is not found.
    """
    record = {}
    for key, pattern in FIELD_PATTERNS:
        found = pattern.search(line)
        record[key] = found.group(1) if found else ''

    if not record['contribution']:
        found = ACADEMIC_CONTRIBUTION_RE.search(line)
        record['contribution'] = found.group(1) if found else ''
    return record


def parse_people(text):
    """Return one parsed record per person found in the contributors text."""
    return [parse_person_line(line) for line in split_person_lines(text)]


def main(source_name="nosta_2017", target_name="2017_list"):
    """Copy every contributor of ``source_name`` into ``target_name``.

    Args:
        source_name: Collection of the ``nosta`` database to read from.
        target_name: Collection of the ``nosta`` database to insert into.
    """
    # Imported here so the parsing helpers stay importable without the driver.
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    db = client["nosta"]
    # NOTE(review): credentials are hard-coded, and Database.authenticate()
    # was removed in PyMongo 4 -- pass username/password to MongoClient when
    # the driver is upgraded.
    db.authenticate("zty", "zty")
    source = db[source_name]
    target = db[target_name]

    projection = {"project_name": 1, "project_content": 1, "_id": 0}
    for count, item in enumerate(source.find({}, projection), 1):
        print(count)  # running count of processed projects
        content = item['project_content']
        if PEOPLE_KEY not in content:
            continue
        people_text = content[PEOPLE_KEY]
        # Empty entries (stored as [] in the source data, or '' / None)
        # carry no contributors.
        if not people_text:
            continue
        for fields in parse_people(people_text):
            record = {'project_name': item['project_name']}
            record.update(fields)
            target.insert_one(record)


if __name__ == "__main__":
    main()