A simple example of crawling wdzj.com (网贷之家) with requests and BeautifulSoup.
Kept here as notes only.
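The core pattern is just requests.get with a browser User-Agent plus BeautifulSoup over the manually decoded bytes (the site's encoding is not detected reliably, so the script decodes by hand). A minimal, standalone sketch of that pattern, not part of the script itself:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
resp = requests.get('https://www.wdzj.com', headers=headers)
soup = BeautifulSoup(resp.content.decode('utf-8'), 'lxml')   # decode manually, as in the script below
print(soup.title.text if soup.title else 'no <title> found')

The full script: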
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import re
import xlwt


class wdzj_spider:
    pingTaiInfo = []

    def request(self, url):
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        return requests.get(url, headers=headers)

    def saveToExcel(self, infoList, fileName='resultme.xls', sheetName='www.wdzj.com'):
        # For every field to write, look up (or assign) the column for its title,
        # then write the value into that column
        rowNo = 1
        excelTitle = {}
        book = xlwt.Workbook()  # create the workbook
        if book is None:
            print("Failed to create file {0}".format(fileName))
            return None
        sheet = book.add_sheet(sheetName)  # add a worksheet
        if sheet is None:
            print("Failed to create sheet {0}".format(sheetName))
            return None
        for info in infoList:
            for item in info.items():
                if str(item[0]) in excelTitle:
                    colNo = excelTitle[item[0]]
                else:
                    colNo = len(excelTitle)
                    excelTitle[str(item[0])] = colNo
                sheet.write(rowNo, colNo, str(item[1]))
            rowNo = rowNo + 1
        for m in excelTitle.items():  # write the header row once all columns are known
            sheet.write(0, m[1], m[0])
        book.save(fileName)
        return rowNo

    def getDataplaneFromPage(self, Link):
        # Scrape the "数据" (data) tab of a platform
        mainHtml = self.request(Link)
        dataInfo = {}
        # The default encoding is detected incorrectly, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        lis = mainBS.find_all('li', attrs={'class': 'normal'})
        for tr in lis:
            divs = tr.find_all('div')
            dataInfo[divs[1].text.strip()] = divs[0].text.strip()
        # print("data={0}".format(dataInfo))
        return dataInfo

    def getGongshangFromPage(self, Link):
        # Scrape the "工商/备案" (business registration / filing) page of a platform
        mainHtml = self.request(Link)
        gongshangInfo = {}
        # The default encoding is detected incorrectly, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        trs = mainBS.find('div', attrs={'class': 'lcen'}).find_all('tr')
        for tr in trs:
            tdTitles = tr.find_all('td', attrs={'class': 't'})
            tds = tr.find_all('td')
            index = 1
            for td in tdTitles:
                gongshangInfo[td.text] = tds[index].text.strip(' ')
                index = index + 2
        # print("registration info={0}".format(gongshangInfo))
        return gongshangInfo

    def getLinkFromPage(self, pingtaiName, pingtaiLink):
        # Scrape one platform's profile page, then follow its data and registration links
        shujuInfo = {}
        lianxifangshiInfo = {}
        pingtaiHtml = self.request(pingtaiLink)
        # The default encoding is detected incorrectly, so decode manually
        txtUTF8 = str(pingtaiHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        briefText = mainBS.find("div", class_="cen-zk").get_text().strip(' ')
        briefInfo = {"P2P平台名称": pingtaiName, "简介": briefText}
        # print("brief={0}".format(briefText))
        gongshangLink = 'https://' + mainBS.find('a', text='工商/备案')['href'].lstrip('/')
        dataA = mainBS.find('div', attrs={'class': 'common-header-nav'}).find('a', text='数据')
        lianxifangshiTitles = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"}).find_all('div', class_='l')
        lianxifangshiContents = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"}).find_all('div', class_='r')
        for i in range(0, len(lianxifangshiTitles)):
            lianxifangshiInfo[lianxifangshiTitles[i].get_text().strip(' ')] = lianxifangshiContents[i].get_text().strip(' ')
        # print("contact info={0}".format(lianxifangshiInfo))
        if dataA:
            dataLink = 'https://' + dataA['href'].lstrip('/')
            shujuInfo = self.getDataplaneFromPage(dataLink)
        gongshangInfo = self.getGongshangFromPage(gongshangLink)
        self.pingTaiInfo.append({**briefInfo, **gongshangInfo, **shujuInfo, **lianxifangshiInfo})

    def getAllPage(self):
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1'
        mainHtml = self.request(startUrl)
        pageStr = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all").text
        searchObj = re.search(r'1/([0-9]+)', pageStr, re.M | re.I)
        pageCount = searchObj.group(1)
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
        baseUrl = 'https://www.wdzj.com'
        print("Total pages: {0}".format(pageCount))
        for i in range(1, int(pageCount) + 1):
            # for i in range(1, 2):
            urlPage = startUrl + str(i)
            pageHtml = self.request(urlPage)
            pageStrs = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')
            print("---------------------------------")
            print("Crawling page {0}, {1} platforms listed".format(i, len(pageStrs)))
            for p in pageStrs:
                a = p.find('a')
                self.getLinkFromPage(a.get_text(), baseUrl + a['href'])
                print("#", end='', flush=True)
            print(" Finished page {0}, crawled {1} platforms".format(i, len(pageStrs)))
        self.saveToExcel(self.pingTaiInfo, fileName='p2p.xls')


if __name__ == '__main__':
    w = wdzj_spider()
    w.getAllPage()
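The only non-obvious part is how saveToExcel lays out columns: different platforms expose different fields, so each previously unseen title is assigned the next free column, and the header row is written last, once all columns are known. A stripped-down sketch of that idea; the two records and the demo.xls file name below are made up for illustration, not scraped data:

import xlwt

rows = [{'name': 'A', 'city': 'Beijing'}, {'name': 'B', 'phone': '123'}]  # hypothetical records
book = xlwt.Workbook()
sheet = book.add_sheet('demo')
titleCol = {}                                    # maps a field title to its column index
for rowNo, info in enumerate(rows, start=1):
    for key, value in info.items():
        colNo = titleCol.setdefault(str(key), len(titleCol))  # unseen key -> next free column
        sheet.write(rowNo, colNo, str(value))
for title, colNo in titleCol.items():            # header row written last
    sheet.write(0, colNo, title)
book.save('demo.xls')

dict.setdefault does in one call what the if/else in saveToExcel spells out explicitly.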
Some modifications: open the workbook once (openExcel/closeExcel), save and clear the collected records after every page instead of only at the very end, guard against a missing contact-info box, and wrap each platform in try/except so a single failure does not abort the whole page.
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import re
import xlwt


class wdzj_spider:
    pingTaiInfo = []
    book = None
    sheet = None
    excelTitle = {}
    rowNo = 1

    def request(self, url):
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        return requests.get(url, headers=headers)

    def openExcel(self, fileName='resultme.xls', sheetName='www.wdzj.com'):
        self.book = xlwt.Workbook()  # create the workbook
        if self.book is None:
            print("Failed to create file {0}".format(fileName))
            return None
        self.sheet = self.book.add_sheet(sheetName)  # add a worksheet
        if self.sheet is None:
            print("Failed to create sheet {0}".format(sheetName))
            return None

    def closeExcel(self, fileName='resultme.xls'):
        # Write the header row once all columns are known, then save
        for m in self.excelTitle.items():
            self.sheet.write(0, m[1], m[0])
        self.book.save(fileName)

    def saveToExcel(self, infoList, fileName='resultme.xls'):
        # For every field to write, look up (or assign) the column for its title,
        # then write the value into that column
        for info in infoList:
            for item in info.items():
                if str(item[0]) in self.excelTitle:
                    colNo = self.excelTitle[item[0]]
                else:
                    colNo = len(self.excelTitle)
                    self.excelTitle[str(item[0])] = colNo
                self.sheet.write(self.rowNo, colNo, str(item[1]))
            self.rowNo = self.rowNo + 1
        self.book.save(fileName)
        return self.rowNo

    def getDataplaneFromPage(self, Link):
        mainHtml = self.request(Link)
        dataInfo = {}
        # The default encoding is detected incorrectly, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        lis = mainBS.find_all('li', attrs={'class': 'normal'})
        for tr in lis:
            divs = tr.find_all('div')
            dataInfo[divs[1].text.strip()] = divs[0].text.strip()
        # print("data={0}".format(dataInfo))
        return dataInfo

    def getGongshangFromPage(self, Link):
        mainHtml = self.request(Link)
        gongshangInfo = {}
        # The default encoding is detected incorrectly, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        trs = mainBS.find('div', attrs={'class': 'lcen'}).find_all('tr')
        for tr in trs:
            tdTitles = tr.find_all('td', attrs={'class': 't'})
            tds = tr.find_all('td')
            index = 1
            for td in tdTitles:
                gongshangInfo[td.text] = tds[index].text.strip(' ')
                index = index + 2
        # print("registration info={0}".format(gongshangInfo))
        return gongshangInfo

    def getLinkFromPage(self, pingtaiName, pingtaiLink):
        shujuInfo = {}
        lianxifangshiInfo = {}
        pingtaiHtml = self.request(pingtaiLink)
        # The default encoding is detected incorrectly, so decode manually
        txtUTF8 = str(pingtaiHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        briefText = mainBS.find("div", class_="cen-zk").get_text().strip(' ')
        briefInfo = {"P2P平台名称": pingtaiName, "简介": briefText}
        # print("brief={0}".format(briefInfo))
        gongshangLink = 'https://' + mainBS.find('a', text='工商/备案')['href'].lstrip('/')
        dataA = mainBS.find('div', attrs={'class': 'common-header-nav'}).find('a', text='数据')

        # The contact-info box may be missing on some profiles
        lianxifangshiBox = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"})
        lianxifangshiTitles = []
        lianxifangshiContents = []
        if lianxifangshiBox:
            lianxifangshiTitles = lianxifangshiBox.find_all('div', class_='l')
            lianxifangshiContents = lianxifangshiBox.find_all('div', class_='r')

        for i in range(0, len(lianxifangshiTitles)):
            lianxifangshiInfo[lianxifangshiTitles[i].get_text().strip(' ')] = lianxifangshiContents[i].get_text().strip(' ')
        # print("contact info={0}".format(lianxifangshiInfo))

        if dataA:
            dataLink = 'https://' + dataA['href'].lstrip('/')
            shujuInfo = self.getDataplaneFromPage(dataLink)

        gongshangInfo = self.getGongshangFromPage(gongshangLink)
        self.pingTaiInfo.append({**briefInfo, **gongshangInfo, **shujuInfo, **lianxifangshiInfo})

    def getAllPage(self):
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1'
        mainHtml = self.request(startUrl)
        pageStr = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all").text
        searchObj = re.search(r'1/([0-9]+)', pageStr, re.M | re.I)
        pageCount = searchObj.group(1)

        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
        baseUrl = 'https://www.wdzj.com'

        print("Total pages: {0}".format(pageCount))
        fileName = 'p2p.xls'
        self.openExcel(fileName)
        for i in range(1, int(pageCount) + 1):
            # for i in range(1, 2):
            urlPage = startUrl + str(i)
            pageHtml = self.request(urlPage)
            pageStrs = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')

            print("---------------------------------")
            print("Crawling page {0}, {1} platforms listed".format(i, len(pageStrs)))
            count = 0
            for p in pageStrs:
                a = p.find('a')
                try:
                    self.getLinkFromPage(a.get_text(), baseUrl + a['href'])
                    count = count + 1
                except:
                    print("Failed to crawl item {0}, name <{1}>".format(count + 1, a.get_text()))
                print("#", end='', flush=True)
            print(" Finished page {0}, crawled {1} platforms".format(i, count))
            self.saveToExcel(self.pingTaiInfo, fileName=fileName)
            self.pingTaiInfo.clear()
        self.closeExcel(fileName)


if __name__ == '__main__':
    w = wdzj_spider()
    w.getAllPage()
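To sanity-check the result, the p2p.xls file can be read back afterwards. A quick check with xlrd; this is not part of the script above and assumes the xlrd package is installed (its .xls support is unaffected by the removal of xlsx support in xlrd 2.0):

import xlrd

book = xlrd.open_workbook('p2p.xls')
sheet = book.sheet_by_index(0)
print('{0} rows x {1} columns'.format(sheet.nrows, sheet.ncols))
print(sheet.row_values(0))        # header row written by closeExcel()
if sheet.nrows > 1:
    print(sheet.row_values(1))    # first platform record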