• Python 爬基金数据


    爬科学基金共享服务网中基金数据

    #coding=utf-8
    import json
    import requests
    from lxml import etree
    from HTMLParser import HTMLParser
    from pymongo import MongoClient
    
    # Form fields posted with every search request. All search filters are left
    # empty; the crawler varies only 'currentPage' from one request to the next.
    data = {
        'pageSize': 10,
        'currentPage': 1,
        'fundingProject.projectNo': '',
        'fundingProject.name': '',
        'fundingProject.person': '',
        'fundingProject.org': '',
        'fundingProject.applyCode': '',
        'fundingProject.grantCode': '',
        'fundingProject.subGrantCode': '',
        'fundingProject.helpGrantCode': '',
        'fundingProject.keyword': '',
        'fundingProject.statYear': '',
        # Percent-encoded UTF-8 text; it decodes to a Chinese prompt meaning
        # "please enter the verification code".
        'checkCode': '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81',
    }

    # Search endpoint that receives the POSTed form.
    url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

    # HTTP headers sent with every request.
    # NOTE(review): these look like they were copied from a browser session
    # (fixed Content-Length, fixed JSESSIONID cookie) -- confirm they are still
    # valid before running a crawl.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '340',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
        'Host': 'npd.nsfc.gov.cn',
        'Origin': 'http://npd.nsfc.gov.cn',
        'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    
    def main(start_page=1, end_page=43184):
        """Crawl the search result pages and store each project in MongoDB.

        For every page number in ``range(start_page, end_page)`` the search
        form is POSTed to ``url``, each ``<dl class="time_dl">`` element of the
        response is parsed with ``jiexi`` and the resulting dict is inserted
        into the ``science_fund`` collection of the ``ScienceFund`` database on
        ``localhost:27017``. The current page number is printed before each
        request, so an interrupted crawl can be resumed via ``start_page``.

        Args:
            start_page: First page to fetch (inclusive).
            end_page: Page number at which to stop (exclusive). The defaults
                reproduce the original hard-coded ``range(1, 43184)``.

        Raises:
            requests.RequestException: If a request times out or the server
                answers with an HTTP error status.
        """
        client = MongoClient('localhost', 27017)
        try:
            db = client.ScienceFund
            # The credentials are blank in this source; supply real ones if
            # the MongoDB instance requires authentication.
            db.authenticate("", "")
            collection = db.science_fund
            unescape = HTMLParser().unescape
            for i in range(start_page, end_page):
                print(i)
                # Build a per-request copy so the module-level form template
                # is not mutated as a side effect of crawling.
                payload = dict(data, currentPage=i)
                # Without a timeout a single stalled connection would block
                # the whole crawl indefinitely.
                result = requests.post(url, data=payload, headers=headers,
                                       timeout=30)
                # Fail loudly instead of parsing an error page as results.
                result.raise_for_status()
                tree = etree.HTML(result.text)
                if tree is None:
                    # Nothing parseable came back for this page; skip it.
                    continue
                for item in tree.xpath("//dl[@class='time_dl']"):
                    content = etree.tostring(item, method='html')
                    # Turn character references back into real characters so
                    # that jiexi can search for the Chinese field labels.
                    content = unescape(content)
                    collection.insert(jiexi(content))
        finally:
            # Always release the MongoDB connection, even after a failure.
            client.close()
    
            
    # (output key, label text) pairs in the order the labels appear in an entry.
    # The label literals are matched verbatim against the page content.
    _FIELD_LABELS = (
        ('standard_no', u'批准号'),
        ('standard_type', u'项目类别'),
        ('supporting_institution', u'依托单位'),
        ('project_principal', u'项目负责人'),
        ('funds', u'资助经费'),
        ('year', u'批准年度'),
        ('keywords', u'关键词'),
    )


    def _extract_field(content, label, start):
        """Return ``(value, cursor)`` for the first ``label`` at or after ``start``.

        The value is the text between the label (plus the single separator
        character that follows it) and the next ``</dd>``, stripped of
        surrounding whitespace. ``cursor`` is where the search for the next
        field should begin. When the label is absent the value is empty and
        the cursor is returned unchanged, so later fields can still be found.
        """
        label_at = content.find(label, start)
        if label_at == -1:
            return u'', start
        value_at = label_at + len(label) + 1  # skip the label and its separator
        close_at = content.find('</dd>', label_at)
        if close_at == -1:
            # Unterminated field: take whatever is left of the content.
            return content[value_at:].strip(), len(content)
        return content[value_at:close_at].strip(), close_at


    def jiexi(content):
        """Parse the HTML text of one project entry into a dict of its fields.

        Args:
            content: Text of one result entry. The title is taken from between
                the first ``">`` found at index 20 or later and the next
                ``</``; every other field is located by its Chinese label and
                read up to the following ``</dd>``.

        Returns:
            A dict with the keys ``title``, ``standard_no``, ``standard_type``,
            ``supporting_institution``, ``project_principal``, ``funds``,
            ``year`` and ``keywords``. A field whose label (or, for the title,
            whose delimiters) cannot be found is an empty string; previously a
            missing label produced a garbage slice and broke every field after
            it.
        """
        cursor = 0
        title = u''
        # Start at index 20 to step over the '">' that ends the opening tag
        # of the entry itself.
        title_open = content.find('">', 20)
        if title_open != -1:
            title_close = content.find('</', title_open + 2)
            if title_close != -1:
                title = content[title_open + 2:title_close]
                cursor = title_close

        dc = {'title': title}
        for key, label in _FIELD_LABELS:
            # Each search resumes where the previous field ended, so labels
            # are matched in document order.
            dc[key], cursor = _extract_field(content, label, cursor)
        return dc
    
    # Start the crawl only when this file is executed as a script; importing
    # it as a module does not trigger any requests.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    Wide character in print at a2.pl line 返回json 需要encode_utf8
    decode_json 必须是unicode形式的字符
    Wide character in print at a2.pl line 6.
    unicode转中文
    用 Flask 来写个轻博客 (4) — (M)VC_创建数据模型和表
    Openstack_通用模块_Oslo_vmware 创建 vCenter 虚拟机快照
    为什么企业数据化运营很重要?
    为什么企业数据化运营很重要?
    Openstack_单元测试工具 tox
    java 把已知下载路径的文件复制到本地
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/8482255.html
Copyright © 2020-2023  润新知