从贝壳网获取房价信息。
基本的步骤和我的这篇博文一样:https://www.cnblogs.com/mrlayfolk/p/12319414.html。不熟悉的可参考一下。
下面是获取 3000 个样本的代码。
# encoding:utf-8

"""
Purpose: scrape second-hand housing price information from Beike (ke.com).
URL: https://cd.ke.com/ershoufang/qingyang/l2/
Environment: python 3.7.3
Required libraries: requests, BeautifulSoup (bs4), xlwt, and lxml (the parser
name passed to BeautifulSoup below).
"""

import logging
import string

import requests
import xlwt
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    "Host": "cd.ke.com",
}

# Titles written to the first sheet row, in the same order as the fields of
# each row produced by get_info().
COLUMN_TITLES = ('名称', '位置', '房屋信息', '总价(万)', '单价(元/平方米)')


def save_info(content):
    """Save the collected rows to the spreadsheet ``./house_info.xls``.

    Args:
        content: iterable of rows. Each row must be indexable and hold at
            least five fields (title, position, house info, total price,
            unit price); only the first five are written.

    Raises:
        IndexError: if a row has fewer than five fields.
    """
    # The cells hold Chinese text, so do not restrict the workbook to ASCII.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('house info')

    for col, title in enumerate(COLUMN_TITLES):
        worksheet.write(0, col, title)

    # Data rows start at sheet row 1, directly below the title row.
    for row_idx, row in enumerate(content, start=1):
        for col in range(len(COLUMN_TITLES)):
            worksheet.write(row_idx, col, row[col])

    workbook.save('./house_info.xls')


def _parse_page(html):
    """Return the listing rows found in the HTML of one result page.

    Each row is ``[title, position, house info, total price, unit price]``.
    """
    soup = BeautifulSoup(html, 'lxml')

    titles = [
        div.div.a.text.strip()
        for div in soup.find_all('div', {'class': 'info clear'})
    ]
    positions = [
        div.a.text.strip()
        for div in soup.find_all('div', {'class': 'positionInfo'})
    ]
    # Remove every whitespace character so the field is one compact string.
    # NOTE(review): the original chained two replace() calls whose first
    # argument was lost in the paste; it presumably targeted newlines -
    # confirm that dropping all whitespace is the intended cleanup.
    houses = [
        ''.join(div.text.split())
        for div in soup.find_all('div', {'class': 'houseInfo'})
    ]
    total_prices = [
        div.span.text.strip()
        for div in soup.find_all('div', {'class': 'totalPrice'})
    ]
    unit_prices = [
        div.span.text.strip().replace('单价', '').replace('元/平米', '')
        for div in soup.find_all('div', {'class': 'unitPrice'})
    ]

    columns = (titles, positions, houses, total_prices, unit_prices)
    counts = [len(column) for column in columns]
    if len(set(counts)) > 1:
        # The fields are matched purely by position, so differing counts mean
        # some rows of this page may pair fields from different listings.
        logger.warning('field counts differ on this page: %s', counts)

    # zip() stops at the shortest column instead of raising IndexError.
    return [list(row) for row in zip(*columns)]


def get_info(pages=100):
    """Download the listing pages and collect one row per listing.

    Args:
        pages: number of result pages to request (default 100).

    Returns:
        A list of rows, each ``[title, position, house info, total price,
        unit price]`` with every field a string.
    """
    all_info = []

    # NOTE(review): pagination is assumed to be 1-based (pg1..pgN) - confirm
    # against the site. The original loop requested pg0..pg99 while printing
    # i + 1 as the page number.
    for page in range(1, pages + 1):
        link = 'https://cd.ke.com/ershoufang/qingyang/pg%dl2/' % page
        try:
            r = requests.get(link, headers=headers, timeout=10)
        except requests.RequestException:
            # Keep what has been collected so far; one failed request should
            # not discard the whole run.
            logger.exception('page %d: request failed, skipping', page)
            continue

        logger.info('%d status_code: %d', page, r.status_code)
        if r.status_code != 200:
            # An error response does not contain listings worth parsing.
            continue

        try:
            rows = _parse_page(r.text)
        except AttributeError:
            # Raised when an expected child tag (div/a/span) is missing.
            logger.exception('page %d: unexpected markup, skipping', page)
            continue

        all_info.extend(rows)

    logger.info('collected %d listings', len(all_info))
    return all_info


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    all_info = get_info()
    save_info(all_info)