• Simple scraping of 网DAI之家 (wdzj.com)


    A simple example of scraping 网DAI之家 (wdzj.com) with requests and BeautifulSoup.

    For my own notes only.
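    The basic pattern used throughout the script is: fetch a page with requests (sending a browser User-Agent), decode the body as UTF-8 by hand, and parse it with BeautifulSoup's lxml parser. A minimal sketch of just that pattern, using the archive URL from the script below:

    import requests
    from bs4 import BeautifulSoup

    def fetch(url):
        # Send a desktop browser User-Agent so the site serves the normal HTML page
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(url, headers=headers)
        # Decode the raw bytes as UTF-8 explicitly instead of trusting the detected encoding
        return BeautifulSoup(str(resp.content, 'utf-8'), 'lxml')

    # Grab the <h2> link texts from one archive page
    soup = fetch('https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1')
    for h2 in soup.find_all('h2'):
        a = h2.find('a')
        if a:
            print(a.get_text(), a['href'])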

    
    
    #!/usr/bin/python3

    import requests
    from bs4 import BeautifulSoup
    import re
    import xlwt

    class wdzj_spider:
        pingTaiInfo = []

        def request(self, url):
            headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
            return requests.get(url, headers=headers)

        def saveToExcel(self, infoList, fileName='resultme.xls', sheetName='www.wdzj.com'):
            # For each record, find (or assign) the column for its key, then write the value into that column
            rowNo = 1
            excelTitle = {}

            book = xlwt.Workbook()  # create an Excel workbook
            if book is None:
                print("创建文件{0}失败".format(fileName))
                return None
            sheet = book.add_sheet(sheetName)  # add the worksheet
            if sheet is None:
                print("创建表单{0}失败".format(sheetName))
                return None

            for info in infoList:
                for item in info.items():
                    if str(item[0]) in excelTitle:
                        colNo = excelTitle[item[0]]
                    else:
                        colNo = len(excelTitle)
                        excelTitle[str(item[0])] = colNo
                    sheet.write(rowNo, colNo, str(item[1]))
                rowNo = rowNo + 1
            for m in excelTitle.items():
                sheet.write(0, m[1], m[0])
            book.save(fileName)
            return rowNo

        def getDataplaneFromPage(self, Link):
            mainHtml = self.request(Link)
            dataInfo = {}
            # The default encoding is wrong, so decode manually
            txtUTF8 = str(mainHtml.content, 'utf-8')
            mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
            lis = mainBS.find_all('li', attrs={'class': 'normal'})
            for tr in lis:
                divs = tr.find_all('div')
                dataInfo[divs[1].text.strip()] = divs[0].text.strip()
            # print("数据={0}".format(dataInfo))
            return dataInfo

        def getGongshangFromPage(self, Link):
            mainHtml = self.request(Link)
            gongshangInfo = {}
            # The default encoding is wrong, so decode manually
            txtUTF8 = str(mainHtml.content, 'utf-8')
            mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
            trs = mainBS.find('div', attrs={'class': 'lcen'}).find_all('tr')
            for tr in trs:
                tdTitles = tr.find_all('td', attrs={'class': 't'})
                tds = tr.find_all('td')
                index = 1
                for td in tdTitles:
                    gongshangInfo[td.text] = tds[index].text.strip(' ')
                    index = index + 2
            # print("工商信息={0}".format(gongshangInfo))
            return gongshangInfo

        def getLinkFromPage(self, pingtaiName, pingtaiLink):
            shujuInfo = {}
            lianxifangshiInfo = {}
            pingtaiHtml = self.request(pingtaiLink)
            # The default encoding is wrong, so decode manually
            txtUTF8 = str(pingtaiHtml.content, 'utf-8')
            mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
            briefText = mainBS.find("div", class_="cen-zk").get_text().strip(' ')
            briefInfo = {"P2P平台名称": pingtaiName, "简介": briefText}
            # print("简介={0}".format(briefText))
            gongshangLink = 'https://' + mainBS.find('a', text='工商/备案')['href'].lstrip('/')
            dataA = mainBS.find('div', attrs={'class': 'common-header-nav'}).find('a', text='数据')

            lianxifangshiTitles = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"}).find_all('div', class_='l')
            lianxifangshiContents = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"}).find_all('div', class_='r')

            for i in range(0, len(lianxifangshiTitles)):
                lianxifangshiInfo[lianxifangshiTitles[i].get_text().strip(' ')] = lianxifangshiContents[i].get_text().strip(' ')
            # print("联系方式={0}".format(lianxifangshiInfo))

            if dataA:
                dataLink = 'https://' + dataA['href'].lstrip('/')
                shujuInfo = self.getDataplaneFromPage(dataLink)

            gongshangInfo = self.getGongshangFromPage(gongshangLink)
            self.pingTaiInfo.append({**briefInfo, **gongshangInfo, **shujuInfo, **lianxifangshiInfo})

        def getAllPage(self):
            startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1'
            mainHtml = self.request(startUrl)
            pageStr = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all").text
            searchObj = re.search(r'1/([0-9]+)', pageStr, re.M | re.I)
            pageCount = searchObj.group(1)

            startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
            baseUrl = 'https://www.wdzj.com'

            print("总页数:{0}".format(pageCount))
            for i in range(1, int(pageCount) + 1):
            # for i in range(1, 2):
                urlPage = startUrl + str(i)
                pageHtml = self.request(urlPage)
                pageStrs = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')

                print("---------------------------------")
                print("开始爬取第{0}页,共存在{1}个平台数据".format(i, len(pageStrs)))
                for p in pageStrs:
                    a = p.find('a')
                    self.getLinkFromPage(a.get_text(), baseUrl + a['href'])
                    print("#", end='', flush=True)
                print(" 结束爬取第{0}页,共爬取{1}个平台数据".format(i, len(pageStrs)))
            self.saveToExcel(self.pingTaiInfo, fileName='p2p.xls')

    if __name__ == '__main__':
        w = wdzj_spider()
        w.getAllPage()

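    The saveToExcel method assigns columns dynamically: records are dicts whose keys vary from platform to platform, each new key gets the next free column on first sight, and the header row is written last once every column is known. A standalone sketch of that same idea with two made-up records:

    import xlwt

    # Two made-up records with partly different keys, like the per-platform dicts above
    records = [
        {'平台名称': 'A平台', '注册资本': '5000万'},
        {'平台名称': 'B平台', '客服电话': '400-000-0000'},
    ]

    book = xlwt.Workbook()
    sheet = book.add_sheet('demo')
    titles = {}   # key -> column number, assigned on first sight
    rowNo = 1     # row 0 is reserved for the header
    for rec in records:
        for key, value in rec.items():
            colNo = titles.setdefault(str(key), len(titles))
            sheet.write(rowNo, colNo, str(value))
        rowNo += 1
    for key, colNo in titles.items():
        sheet.write(0, colNo, key)   # header row written once all columns are known
    book.save('demo.xls')
    # Resulting columns: 平台名称 | 注册资本 | 客服电话; cells with no value stay blank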
     A few revisions follow: the workbook is opened once (openExcel) and finalized at the end (closeExcel), each page's results are written and saved as soon as that page is scraped (then pingTaiInfo is cleared), a try/except around each platform keeps a single failure from aborting the run, and the contact-info block is checked for existence before parsing.

    #!/usr/bin/python3

    import requests
    from bs4 import BeautifulSoup
    import re
    import xlwt

    class wdzj_spider:
        pingTaiInfo = []
        book = None
        sheet = None
        excelTitle = {}
        rowNo = 1

        def request(self, url):
            headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
            return requests.get(url, headers=headers)

        def openExcel(self, fileName='resultme.xls', sheetName='www.wdzj.com'):
            self.book = xlwt.Workbook()  # create an Excel workbook
            if self.book is None:
                print("创建文件{0}失败".format(fileName))
                return None
            self.sheet = self.book.add_sheet(sheetName)  # add the worksheet
            if self.sheet is None:
                print("创建表单{0}失败".format(sheetName))
                return None

        def closeExcel(self, fileName='resultme.xls'):
            for m in self.excelTitle.items():
                self.sheet.write(0, m[1], m[0])
            self.book.save(fileName)

        def saveToExcel(self, infoList, fileName='resultme.xls'):
            # For each record, find (or assign) the column for its key, then write the value into that column
            for info in infoList:
                for item in info.items():
                    if str(item[0]) in self.excelTitle:
                        colNo = self.excelTitle[item[0]]
                    else:
                        colNo = len(self.excelTitle)
                        self.excelTitle[str(item[0])] = colNo
                    self.sheet.write(self.rowNo, colNo, str(item[1]))
                self.rowNo = self.rowNo + 1
            self.book.save(fileName)
            return self.rowNo

        def getDataplaneFromPage(self, Link):
            mainHtml = self.request(Link)
            dataInfo = {}
            # The default encoding is wrong, so decode manually
            txtUTF8 = str(mainHtml.content, 'utf-8')
            mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
            lis = mainBS.find_all('li', attrs={'class': 'normal'})
            for tr in lis:
                divs = tr.find_all('div')
                dataInfo[divs[1].text.strip()] = divs[0].text.strip()
            # print("数据={0}".format(dataInfo))
            return dataInfo

        def getGongshangFromPage(self, Link):
            mainHtml = self.request(Link)
            gongshangInfo = {}
            # The default encoding is wrong, so decode manually
            txtUTF8 = str(mainHtml.content, 'utf-8')
            mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
            trs = mainBS.find('div', attrs={'class': 'lcen'}).find_all('tr')
            for tr in trs:
                tdTitles = tr.find_all('td', attrs={'class': 't'})
                tds = tr.find_all('td')
                index = 1
                for td in tdTitles:
                    gongshangInfo[td.text] = tds[index].text.strip(' \n')
                    index = index + 2
            # print("工商信息={0}".format(gongshangInfo))
            return gongshangInfo

        def getLinkFromPage(self, pingtaiName, pingtaiLink):
            shujuInfo = {}
            lianxifangshiInfo = {}
            pingtaiHtml = self.request(pingtaiLink)
            # The default encoding is wrong, so decode manually
            txtUTF8 = str(pingtaiHtml.content, 'utf-8')
            mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
            briefText = mainBS.find("div", class_="cen-zk").get_text().strip(' \n')
            briefInfo = {"P2P平台名称": pingtaiName, "简介": briefText}
            # print("简介={0}".format(briefInfo))
            gongshangLink = 'https://' + mainBS.find('a', text='工商/备案')['href'].lstrip('/')
            dataA = mainBS.find('div', attrs={'class': 'common-header-nav'}).find('a', text='数据')

            lianxifangshiBox = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"})
            lianxifangshiTitles = []
            lianxifangshiContents = []
            if lianxifangshiBox:
                lianxifangshiTitles = lianxifangshiBox.find_all('div', class_='l')
                lianxifangshiContents = lianxifangshiBox.find_all('div', class_='r')

            for i in range(0, len(lianxifangshiTitles)):
                lianxifangshiInfo[lianxifangshiTitles[i].get_text().strip(' \n')] = lianxifangshiContents[i].get_text().strip(' \n')
            # print("联系方式={0}".format(lianxifangshiInfo))

            if dataA:
                dataLink = 'https://' + dataA['href'].lstrip('/')
                shujuInfo = self.getDataplaneFromPage(dataLink)

            gongshangInfo = self.getGongshangFromPage(gongshangLink)
            self.pingTaiInfo.append({**briefInfo, **gongshangInfo, **shujuInfo, **lianxifangshiInfo})

        def getAllPage(self):
            startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1'
            mainHtml = self.request(startUrl)
            pageStr = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all").text
            searchObj = re.search(r'1/([0-9]+)', pageStr, re.M | re.I)
            pageCount = searchObj.group(1)

            startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
            baseUrl = 'https://www.wdzj.com'

            print("总页数:{0}".format(pageCount))
            fileName = 'p2p.xls'
            self.openExcel(fileName)
            for i in range(1, int(pageCount) + 1):
            # for i in range(1, 2):
                urlPage = startUrl + str(i)
                pageHtml = self.request(urlPage)
                pageStrs = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')

                print("---------------------------------")
                print("开始爬取第{0}页,共存在{1}个平台数据".format(i, len(pageStrs)))
                count = 0
                for p in pageStrs:
                    a = p.find('a')
                    try:
                        self.getLinkFromPage(a.get_text(), baseUrl + a['href'])
                        count = count + 1
                    except:
                        print("爬取第{0}个数据,名称<{1}>失败".format(count + 1, a.get_text()))
                    print("#", end='', flush=True)
                print("\n结束爬取第{0}页,共爬取{1}个平台数据".format(i, count))
                self.saveToExcel(self.pingTaiInfo, fileName=fileName)
                self.pingTaiInfo.clear()
            self.closeExcel(fileName)

    if __name__ == '__main__':
        w = wdzj_spider()
        w.getAllPage()
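
    For a quick test while developing, the class can also be driven directly instead of crawling every page. A hypothetical snippet (appended to the same file in place of the if __name__ block, so the imports and class above are in scope; file name and the limit of three platforms are arbitrary):

    # Hypothetical quick test: scrape only the first archive page, first three platforms
    w = wdzj_spider()
    w.openExcel('p2p_test.xls')
    pageHtml = w.request('https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1')
    h2s = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')
    for p in h2s[:3]:
        a = p.find('a')
        w.getLinkFromPage(a.get_text(), 'https://www.wdzj.com' + a['href'])
    w.saveToExcel(w.pingTaiInfo, fileName='p2p_test.xls')
    w.closeExcel('p2p_test.xls')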