• Scraping Baidu Scholar (百度学术)


    import requests
    from lxml import etree
    import time
    import csv
    
    # Silence the InsecureRequestWarning triggered by verify=False below
    requests.packages.urllib3.disable_warnings()
    # Name of the CSV file to generate
    csv_name = "123.csv"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    
    
    
    """1 第一步"""
    # 获取要爬取的分页, 当前第几页,总共要多少条
    def get_page_total(p1,total):
    
        for x in range(p1,total):
            p1=str(x*10)
            url = "https://xueshu.baidu.com/s?wd=journaluri%3A%2820bd239813882ced%29%20applied%20energy&pn="+p1+"&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sc_hit=1"
            #print(url)
            print("当前第"+str(x)+"页,共需要"+str(total)+"")
            get_url(url)
            time.sleep(1)
            #print(x)
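    # The "pn" offset above could also be built with urllib.parse.urlencode,
    # which handles the percent-escaping that the hard-coded URL bakes in
    # (a sketch, assuming Baidu accepts the equivalently encoded query):
    #
    #   from urllib.parse import urlencode
    #   params = {"wd": "journaluri:(20bd239813882ced) applied energy",
    #             "pn": x * 10, "tn": "SE_baiduxueshu_c1gjeupa",
    #             "ie": "utf-8", "sc_hit": 1}
    #   url = "https://xueshu.baidu.com/s?" + urlencode(params)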
    
    
    """2 第二步"""
    #获取url文章链接地址,-》跳转到详情页
    """
    content = ['https://xueshu.baidu.com/usercenter/paper/show?paperid=e04cdee2122f75b0011cc9e7b452d72b&site=xueshu_se',
    'https://xueshu.baidu.com/usercenter/paper/show?paperid=9ccc121c6260e006c41c32f04ddf2e85&site=xueshu_se'] ...
    """
    def get_url(url):
    
        r = requests.get(url, headers=headers, verify=False)
        html = r.text
        selector = etree.HTML(html)
        content = selector.xpath('//h3[@class="t c_font"]//a/@href')
    
        # Scrape each detail page; Baidu Scholar lists ten results per page
        get_page_content(content)
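    # The same link extraction sketched with BeautifulSoup instead of lxml
    # (assuming the result headings keep the "t c_font" class):
    #
    #   from bs4 import BeautifulSoup
    #   soup = BeautifulSoup(html, "html.parser")
    #   content = [a["href"] for a in soup.select('h3.t.c_font a[href]')]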
    
    
    """3"""
    #获取内容详情,百度文库每页是十条,
    def get_page_content(detail_url):
    
        for link in detail_url:
            data = []
            rr = requests.get(link, headers=headers, verify=False)
    
            selector = etree.HTML(rr.text)
            # Authors
            zuozhe = selector.xpath('//p[@class="author_text"]//span//text()')
            # Abstract
            zhaiyao = selector.xpath('//p[@class="abstract"]//text()')
    
            # Join the text nodes into one cell each
            data.append(','.join(zuozhe))
            data.append(','.join(zhaiyao))
    
            print("Writing row to CSV")
            f_csv(data)
            time.sleep(1)
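    # The loop above has no error handling, so one failed request aborts the
    # whole run. A minimal hardening sketch (hypothetical, not in the original):
    #
    #   try:
    #       rr = requests.get(link, headers=headers, verify=False, timeout=10)
    #       rr.raise_for_status()
    #   except requests.RequestException as e:
    #       print("Skipping", link, e)
    #       continue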
    
    """4"""
    #写入csv ["111", "222"]
    
    def f_csv(data):
    
        f = open(csv_name, 'a+', newline='', encoding='utf-8')
        # 2. 基于文件对象构建 csv写入对象
        csv_writer = csv.writer(f)
        #csv_writer.writerow(["作者", '摘要'])
        # 3. 构建列表头
        csv_writer.writerow(data)
        f.close()
        pass
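    # Note: Excel misreads plain UTF-8 CSVs that contain Chinese text; writing
    # with encoding='utf-8-sig' adds a BOM that Excel recognizes (a sketch, if
    # Excel is the target consumer):
    #
    #   with open(csv_name, 'a+', newline='', encoding='utf-8-sig') as f:
    #       csv.writer(f).writerow(data)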
    
    
    """run 爬虫"""
    
    #生成csv头部
    csv_head = ["作者","摘要"]
    
    #print(csv_head)
    f_csv(csv_head)
    
    #获取每篇文章url # 获取要爬取的分页, 当前第几页,总共要多少条
    
    get_page_total(0,1)
    
    # Sample extracted fields, handy for testing the joins and f_csv by hand:
    # datas = [['M Poeschl', 'S Ward', 'P Owende'],
    #          ['The energy efficiency of, different, biogas systems']]
    # print(','.join(datas[0]))
    # print(','.join(datas[1]))
    # f_csv([','.join(datas[0]), ','.join(datas[1])])
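
A small, hedged refactor of the driver code above (same entry points, nothing renamed): a main guard keeps the crawl from firing on import, and a shared requests.Session would set the headers and the TLS flag once, assuming get_url and get_page_content were switched to session.get:

    if __name__ == "__main__":
        session = requests.Session()
        session.headers.update(headers)   # reuse the User-Agent everywhere
        session.verify = False            # mirrors verify=False in the original
        f_csv(["作者", "摘要"])            # header row: "Author", "Abstract"
        get_page_total(0, 1)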