• Python 爬虫：爬取能源招标网（dlzb.com）的搜索结果标题


    import requests
    import re
    import time
    from bs4 import BeautifulSoup
    import csv
    import xlrd
    from xlutils.copy import copy
    import random
    ##屏蔽https错误
    requests.packages.urllib3.disable_warnings()
    class Spider():
        """Scrape bid-announcement titles from dlzb.com for one search keyword.

        Each matching title is appended to a CSV file (via writeXlx) or an
        existing XLS workbook (via writeXLSAppend, used by getPage).
        """

        def __init__(self, keyworks):
            # Search keyword (parameter name kept for backward compatibility).
            self.kw = keyworks
            # Browser-like headers; the Cookie was captured from a live session
            # and may need refreshing if the site starts rejecting requests.
            self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Cookie":"D3z_vi-ds=f1f6d61ffd02c29c1cd832a363888be3; __jsluid_s=0b360d705e0e333a682280ae3b03bf90; Hm_lvt_c909c1510b4aebf2db610b8d191cbe91=1655284406; Hm_lpvt_c909c1510b4aebf2db610b8d191cbe91=1655285546",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
            }
            self.csv_name = "123.csv"   # CSV output file (appended to)
            self.url = "https://www.dlzb.com/zb/search.php?kw="+str(self.kw)
            self.path = "123.xls"       # XLS output file; must already exist as a valid .xls

        def getContent(self, nextUrl):
            """GET one result page and return the requests Response.

            TLS verification is disabled (site certificate issues); a timeout is
            set so a hung connection cannot stall the crawl forever.
            """
            return requests.get(nextUrl, headers=self.headers, verify=False, timeout=30)

        # Append one row to the CSV file.
        def writeXlx(self, title, kw):
            """Append a [title, keyword] row to self.csv_name (UTF-8 CSV)."""
            # Context manager guarantees the file is closed even on write errors.
            with open(self.csv_name, 'a+', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow([title, kw])

        def getPage(self, p, total):
            """Crawl up to `total` result pages, starting at page number `p`.

            Stops early as soon as a page no longer contains the result <ul>,
            which is how the site signals that results have run out.
            """
            for num in range(total):
                page_no = num + p
                nextUrl = self.url + "&page=" + str(page_no)
                print(nextUrl)
                content = self.getContent(nextUrl)
                # re.S lets '.' span newlines so the whole <ul> block is captured.
                res = re.findall(r'<ul class=\"gclist_ul listnew\">(.*?)<\/ul>', content.text, re.S)
                if not res:
                    # No result list on this page: keyword exhausted (or blocked).
                    print(self.kw, "end*************")
                    print(content.text)
                    print(res)
                    break
                titleList = re.findall(r'<a class=\"gccon_title\".*?>(.*?)<\/a>', res[0], re.S)
                data = []
                for title in titleList:
                    # BeautifulSoup strips any HTML tags nested inside the title.
                    text = BeautifulSoup(title, 'html.parser').get_text()
                    print(str(self.kw) + "-第" + str(page_no) + "页", text)
                    data.append([text, self.kw])
                self.writeXLSAppend(data)
                # Random delay between pages to avoid hammering the server.
                time.sleep(random.uniform(1, 10))

        def writeXLSAppend(self, value):
            """Append the rows in `value` (list of row lists) to sheet 0 of self.path.

            Reads the current row count with xlrd, converts the workbook to a
            writable xlwt copy via xlutils.copy, writes the new rows after the
            existing ones, and saves in place.
            NOTE(review): assumes self.path already exists as a valid .xls file.
            """
            workbook = xlrd.open_workbook(self.path)
            sheets = workbook.sheet_names()
            worksheet = workbook.sheet_by_name(sheets[0])
            rows_old = worksheet.nrows              # existing rows -> append offset
            new_workbook = copy(workbook)           # xlrd (read-only) -> xlwt (writable)
            new_worksheet = new_workbook.get_sheet(0)
            for i, row in enumerate(value):
                for j, cell in enumerate(row):
                    # Write below the rows that were already in the sheet.
                    new_worksheet.write(i + rows_old, j, cell)
            new_workbook.save(self.path)

            print("xls格式表格【追加】写入数据成功!")
    if __name__ == '__main__':
        # Example search URL:
        # https://www.dlzb.com/zb/search.php?kw=%E6%99%BA%E8%83%BD%E5%AE%A2%E6%9C%8D
        # Keywords to scrape, one crawl per keyword.
        keywords = ['人工智能','知识图谱','计算机视觉','图像识别','文本挖掘','文本分析','知识问答','神经网络']

        for kw in keywords:
            print(kw)
            # Start at page 1, crawl at most 1000 pages; getPage stops early
            # once a page no longer contains results. (Returns None.)
            Spider(kw).getPage(1, 1000)
  • 相关阅读:
    Genymotion安卓模拟器和VirtualBox虚拟机安装、配置、测试(win7_64bit)
    jQuery UI (15)图标
    CSS(01)CSS层叠样式表
    Jquery(12)Jquery Ajax跨域访问
    Jquery(13)Jquery this的指向
    Jquery(18)Jquery Cookie操作
    EntityFramework报错
    单例模式应用
    FormCollection的用法
    $.ajax、$.post[转]
  • 原文地址:https://www.cnblogs.com/wtcl/p/16381736.html
Copyright © 2020-2023  润新知