• Python: Scraping Foreign-Language Journal Article Metadata (Machinery and Instrumentation Industry)


    NSTL (National Science and Technology Library): metadata for all 2017 journal articles in the Machinery and Instrumentation Industry category

    The code is fairly rough; please bear with it.

    Step 1: scrape all journal links

    #coding=utf-8
    
    import time
    from selenium import webdriver
    from lxml import etree
    from pymongo import MongoClient
    
    client = MongoClient("IP", 27017)
    db = client["nstl"]
    collection=db["journal_urls"]
    db.authenticate("","")
     
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
    driver.get('https://www.nstl.gov.cn/facade/search/clcSearch.do?&lan=eng&clc=TH')
    
    html = driver.page_source
    tree = etree.HTML(html)
    count = int(tree.xpath("//span[@id='totalPages1']/text()")[0])
    
    # 47 pages in total
    for i in range(count):
    
        html = driver.page_source
        tree = etree.HTML(html)
    
        # extract every journal link on the current page and store it
        table = tree.xpath("//div[@class='s2listtd2']/span/a/@href")
        for j in table:
            bson = {}
            bson['url'] = j
            collection.insert_one(bson)
    
        # stop once the last page (i == count-1) has been scraped
        if i==(count-1):
            break
    
        # click the next-page button
        driver.find_element_by_xpath('//div[@id="page"]/div//a[text()="%s"]'%str(i+2)).click()
    
        # poll until the page source changes, confirming the page turn succeeded
        while True:
            time.sleep(1)
            if driver.page_source!=html:
                break
    
    driver.close()
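
    The one-second polling loop above works but is brittle. Below is a minimal sketch of an explicit wait with Selenium's WebDriverWait, assuming the click re-renders the pager so the old node goes stale (the wait_for_page_turn helper is made up for illustration):

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_for_page_turn(driver, old_element, timeout=10):
        # block until the old node is detached from the DOM, which
        # signals that the page has re-rendered after the click
        WebDriverWait(driver, timeout).until(EC.staleness_of(old_element))

    Grabbing the pager element before the click and passing it in would replace the while True / time.sleep(1) comparison.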

    Step 2: scrape the links of every 2017 article in each journal

    #coding=utf-8
    from pymongo import MongoClient
    from lxml import etree
    from selenium import webdriver
    import time
    
    client = MongoClient("IP", 27017)
    db = client["nstl"]
    collection1=db["journal_urls"]
    collection2=db["journalArticle2017_urls"]
    db.authenticate("","")
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
    # loop over every stored journal link
    for item in collection1.find({}, {"url":1, "_id":0}):
        # the stored href wraps the real URL; the slice extracts it
        driver.get(item['url'][29:-4])
        html = driver.page_source
        tree = etree.HTML(html)
        # if 2018 papers are present, the 2017 list must be clicked open first
        table_2018 = tree.xpath("//div[@id='year_2018']")
        if table_2018!=[]:
            driver.find_element_by_xpath("//div[@id='year_2017']").click()
            time.sleep(1)
            driver.find_element_by_xpath("//div[@id='volumeUl_2017']/div[@class='ltreebom2']").click()
        # re-parse after the clicks, then count the 2017 issues and loop over them
        tree = etree.HTML(driver.page_source)
        table = tree.xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3']/a")
        for i in range(1, len(table)+1):
            wen_html = driver.page_source
            wen_tree = etree.HTML(wen_html)
            # collect every article link in the current issue
            wen_table = wen_tree.xpath("//div[@class='s2listtd2']/a/@href")
            for j in wen_table:
                bson = {}
                bson['url'] = j
                collection2.insert_one(bson)
            # stop after the last issue
            if i==len(table):
                break
            # click open the next issue
            try:
                driver.find_element_by_xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3'][%s]"%str(i+1)).click()
            except:
                break
            # poll until the page source changes, confirming the click worked
            while True:
                time.sleep(1)
                if driver.page_source!=wen_html:
                    break
    
    driver.close()
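
    The listing pages can surface the same link more than once across runs, so duplicates may accumulate in the collections. Here is a small sketch against the same journalArticle2017_urls collection (credentials elided as in the post; the save_url helper is made up) that enforces uniqueness at the database level:

    from pymongo import MongoClient, ASCENDING

    client = MongoClient("IP", 27017)
    coll = client["nstl"]["journalArticle2017_urls"]

    # a unique index turns repeat inserts of the same link into no-ops
    coll.create_index([("url", ASCENDING)], unique=True)

    def save_url(url):
        # upsert: insert when missing, leave untouched when already stored
        coll.update_one({"url": url}, {"$setOnInsert": {"url": url}}, upsert=True)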

    Step 3: scrape the source of each article's detail page

    #coding=utf-8
    import requests
    from pymongo import MongoClient
    from lxml import etree
    from selenium import webdriver
    import time
    
    client = MongoClient("IP", 27017)
    db = client["nstl"]
    collection=db["journalArticle2017_urls"]
    collection1=db["journalArticle2017_codes"]
    db.authenticate("","")
    
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
    
    # loop over every article link and build the full-view URL
    for item in collection.find({}, {"url":1, "_id":0}):
    
        url = "https://www.nstl.gov.cn/facade/search/toFullView.do?checkedSEQNO="+item['url'][23:-11]+"&subDocType="+item['url'][-8:-3]
    
        # # alternative: fetch the page source with requests.post
        # for i in range(100):
        #     try:
        #         result = requests.post(url, verify = False)
        #     except:
        #         time.sleep(1)
        #         continue

        #     html = result.text
        #     if html:
        #         break

        # render the page in the browser instead; poll until the source
        # changes, i.e. the page's JS has filled in the document data
        driver.get(url)
        html = driver.page_source
        for i in range(100):
            time.sleep(1)
            if driver.page_source != html:
                break

        # store the rendered source
        bson = {}
        html1 = driver.page_source
        bson['html'] = html1
        collection1.insert_one(bson)
    
    driver.close()
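
    For reference, the commented-out requests route can be completed as below. This is a sketch only, assuming the full-view endpoint answers a plain POST without a session (the fetch_fullview helper is made up):

    import time
    import requests

    def fetch_fullview(url, retries=10):
        # retry transient failures with a short backoff; return the
        # page source, or None if every attempt fails
        for _ in range(retries):
            try:
                result = requests.post(url, verify=False, timeout=10)
                if result.text:
                    return result.text
            except requests.RequestException:
                pass
            time.sleep(1)
        return None

    Note that requests only sees the raw response; if the detail page is filled in by JavaScript, the browser route above is still needed.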

    Step 4: parse the stored source

    #coding=utf-8
    from pymongo import MongoClient
    from lxml import etree
    
    client = MongoClient("IP", 27017)
    db = client["nstl"]
    collection1 = db["journalArticle2017_codes"]
    collection2 = db["journalArticle2017_data"]
    db.authenticate("","")
    
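    # Chinese field labels exactly as they appear on the NSTL detail page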
    zzdw, km, ma, cbn, j, q, qy, zy, zys, flh, gjc, yz, wz = u'【作者单位】:', u'【刊名】:', u'【ISSN】:', u'【出版年】:', u'【卷】:', u'【期】:', u'【起页】:', u'【止页】:', u'【总页数】:', u'【分类号】:', u'【关键词】:', u'【语种】:', u'【文摘】:'
    
    # loop over every stored page source and extract the fields
    for item in collection1.find({}, {"html":1, "_id":0}):
        html = item["html"]
        tree = etree.HTML(html)
    
        title = tree.xpath("//span[@name='title']/text()")
        author = tree.xpath("//a[starts-with(@href,'javascript:searchByAuthor')]/text()")
        organization = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%zzdw)
        journal_name = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%km)
        issn = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%ma)
        publication_year = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%cbn)
        volume = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%j)
        issue = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%q)
        page_start = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%qy)
        page_end = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%zy)
        page_count = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%zys)
        clc = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%flh)
        keywords = tree.xpath("//div[text()='%s']/following-sibling::*/span/a/text()"%gjc)
        language = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%yz)
        summary = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%wz)
    
        dc = {}
        dc['title'] = title[0]
        if author: dc['author'] = author
        if organization: dc['organization'] = organization[0]
        if journal_name: dc['journal_name'] = journal_name[0]
        if issn: dc['issn'] = issn[0]
        if publication_year: dc['publication_year'] = publication_year[0]
        if volume: dc['volume'] = volume[0]
        if issue: dc['issue'] = issue[0]
        if page_start: dc['page_start'] = page_start[0]
        if page_end: dc['page_end'] = page_end[0]
        if page_count: dc['page_count'] = page_count[0]
        if clc: dc['clc'] = clc[0]
        if keywords: dc['keywords'] = keywords
        if language: dc['language'] = language[0]
        if summary: dc['summary'] = summary[0]
    
        collection2.insert_one(dc)
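
    To get the parsed records out of MongoDB for analysis, here is a short sketch that dumps them to CSV (assuming the journalArticle2017_data collection populated above; the articles_2017.csv filename and field list are illustrative):

    import csv
    from pymongo import MongoClient

    client = MongoClient("IP", 27017)
    fields = ['title', 'author', 'journal_name', 'issn', 'publication_year',
              'volume', 'issue', 'page_start', 'page_end', 'language']

    with open('articles_2017.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
        writer.writeheader()
        for doc in client["nstl"]["journalArticle2017_data"].find({}, {"_id": 0}):
            # join list-valued fields (e.g. author) into a single cell
            row = {k: "; ".join(v) if isinstance(v, list) else v
                   for k, v in doc.items()}
            writer.writerow(row)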