#coding=utf-8
"""Clean crawled conference-paper pages stored in the ``cnki`` MongoDB database.

Each document of the ``conference`` collection is expected to carry the raw
page markup under the ``html`` key.  The markup is parsed with lxml, the
bibliographic fields are extracted with XPath, and the resulting flat
dictionary is written to the ``conference_cleaned`` collection.
"""
import logging

from pymongo import MongoClient
from lxml import etree
import requests

logger = logging.getLogger(__name__)

# Exact text of the <p> nodes that introduce the organization / author links.
# The surrounding whitespace is part of the XPath text() comparison.
# NOTE(review): confirm the whitespace against the real page markup.
jigou = u" 【机构】 "
zuozhe = u" 【作者】 "

# Labels recognised by hyxx(), in the order of the values it returns.
_HYXX_LABELS = (
    u"【会议录名称】",
    u"【会议名称】",
    u"【会议时间】",
    u"【会议地点】",
    u"【分类号】",
    u"【主办单位】",
)
# The organiser field additionally has its enumeration commas normalised.
_ZBDW_LABEL = u"【主办单位】"


# Get the database handle.
def get_db():
    """Connect to the local MongoDB server and return the ``cnki`` database.

    NOTE(review): ``Database.authenticate`` belongs to the legacy PyMongo API;
    newer PyMongo releases expect credentials to be passed to ``MongoClient``.
    Left unchanged because the installed PyMongo version is not visible here.
    """
    client = MongoClient('localhost', 27017)
    db = client.cnki
    db.authenticate("用户名","密码")
    return db


# Get the num-th record.
def get_data(table, num):
    """Return the ``html`` value of the *num*-th document (1-based) of *table*.

    If the document at position *num* has no non-empty ``html`` value, the
    following documents are examined and the first non-empty ``html`` value is
    returned.  Returns ``None`` when nothing qualifies.
    """
    position = 1
    for item in table.find({}, {"html": 1, "_id": 0}):
        if position != num:
            # Still skipping towards the requested position.
            position += 1
            continue
        html = item.get('html')
        if html:
            return html
    return None


# First element of a list as a string.
def list_str(list):
    """Return the first element of *list*, or ``""`` when it is empty."""
    if list:
        return list[0]
    return ""


# English author names, English organization names.
def en_ls(list, length1, length2):
    """Split the English author line into author and organization names.

    *list* holds the text nodes of the English author paragraph; only the
    first one is used.  After removing the ``【Author】`` label and all spaces,
    the text is split on ``;``.  The split must yield exactly
    ``length1 + length2`` names plus one trailing element (which is dropped);
    otherwise the line cannot be attributed reliably and two empty strings
    are returned.

    Returns a pair ``(authors, organizations)``, each joined with ``;``.
    """
    if not list:
        return "", ""
    parts = list[0].replace(u"【Author】","").replace(" ","").strip().split(";")
    if len(parts) != length1 + length2 + 1:
        return "", ""
    return list2str(parts[:length1]), list2str(parts[length1:-1])


def hyxx(list):
    """Extract the conference information from the summary list items.

    Each item of *list* is matched against the known labels; the first label
    contained in an item decides which field it fills.  The label and all
    spaces are removed from the stored value, and for the organiser field the
    enumeration comma is replaced by ``;``.

    Returns a 6-tuple ``(proceedings name, conference name, conference time,
    conference place, classification number, organiser)``.  Every field that
    is not found is the empty string, including the classification number,
    which previously defaulted to an empty list.
    """
    values = [""] * len(_HYXX_LABELS)
    for item in list:
        for index, label in enumerate(_HYXX_LABELS):
            if label not in item:
                continue
            text = item.replace(label, "")
            if label == _ZBDW_LABEL:
                text = text.replace(u"、", ";")
            values[index] = text.replace(" ","").strip()
            break
    return tuple(values)


# List to string.
def list2str(list):
    """Join the elements of *list* with ``;`` (``""`` for an empty list)."""
    return ";".join(list)


# Build the dictionary that is stored for one paper.
def standard_dict(html):
    """Parse one page of markup and return the cleaned field dictionary."""
    dc = {}
    logger.debug("parsing page")
    tree = etree.HTML(html)

    # Each link list is queried once and reused for both the joined string
    # and the count.
    authors = tree.xpath("//p[text()='%s']/a/text()"%zuozhe)
    organizations = tree.xpath("//p[text()='%s']/a/text()"%jigou)

    # Paper title
    dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
    # Foreign-language title
    dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))
    # Authors
    dc["author"] = list2str(authors)
    # Organizations
    dc["organization"] = list2str(organizations)
    # English author names, English organization names
    dc["author_eng"], dc["organization_eng"] = en_ls(
        tree.xpath("//p[@id='au_en']/text()"), len(authors), len(organizations))
    # Abstract
    dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
    # English abstract
    dc["summary_eng"] = list_str(tree.xpath("//span[@id='EnChDivSummary']/text()"))
    # Keywords
    dc["keywords"] = list2str(tree.xpath("//div[@class='keywords']/span[1]/a/text()"))
    # English keywords
    dc["keywords_eng"] = list2str(tree.xpath("//div[@class='keywords']/span[2]/a/text()"))
    # Conference information
    (dc["proceeding_title"], dc["conference_title"], dc["conference_date"],
     dc["conference_place"], dc["huiyflh"], dc["conference_org"]) = hyxx(
        tree.xpath("//div[@class='summary']/ul/li/text()"))
    if dc["proceeding_title"] == "":
        # The plain-text item yielded no proceedings name; fall back to the
        # link text inside the first list of the summary block.
        logger.debug("proceedings title taken from link text")
        dc["proceeding_title"] = list_str(
            tree.xpath("//div[@class='summary']/ul[1]/li/a/text()"))
    return dc


# Main function.
def main():
    """Clean every document of ``conference`` into ``conference_cleaned``."""
    db = get_db()
    collection = db.conference
    collection2 = db.conference_cleaned
    for item in collection.find({}, {"html": 1, "_id": 0}):
        html = item.get('html')
        if not html:
            # Nothing to parse for this document.
            continue
        # NOTE(review): ``Collection.insert`` is the legacy PyMongo call;
        # switch to ``insert_one`` once the PyMongo version is confirmed.
        collection2.insert(standard_dict(html))


if __name__ == '__main__':
    main()

# The following code can be used to test the cleaning of one specific record:
# db = get_db()
# collection = db.conference
# data = get_data(collection, 1)
# dc = standard_dict(data)
# for k, v in dc.items():
#     print(k, v)