#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Tsukasa
"""Scrape second-hand housing listings from esf.<city>.fang.com into Excel.

The script prompts for a city abbreviation and a page count, walks the
listing index pages ``http://esf.<city>.fang.com/house/i3<page>/``, opens
every listing linked from them, collects the listing's fields into a dict,
and finally writes all dicts to ``temp.xlsx``.
"""

import time

import pandas
import requests
from bs4 import BeautifulSoup

# Seconds to wait for a single HTTP response. Without a timeout a stalled
# connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

url_all = []
url_in = input('输入你所需要城市的字母简写: 如:中山 zs , 广州 gz !!!不要乱输入,不然运行不了')
# The page count is stored as count + 1 because it is used below as the
# exclusive upper bound of range(1, url_number).
url_number = 1 + int(input('输入爬取页数:'))
okl = []  # not used by the code below; kept so the module's globals are unchanged


# NOTE: this name shadows the builtin open() inside this module. It is kept
# so that existing references to the function keep working.
def open(nobe):
    """Return the detail-page URLs linked from one listing index page.

    Args:
        nobe: URL of a listing index page.

    Returns:
        A list of absolute URLs, one per ``.houseList dl`` entry, built by
        prefixing the href of the entry's first ``.title a`` link with the
        site root of the city stored in the module-level ``url_in``.

    Raises:
        requests.RequestException: if the request fails or times out.
        IndexError: if an entry contains no ``.title a`` link.
    """
    res = requests.get(nobe, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'html5lib')
    url_start = 'http://esf.' + url_in + '.fang.com'
    return [
        url_start + entry.select('.title a ')[0]['href']
        for entry in soup.select('.houseList dl')
    ]


def content(url):
    """Fetch one listing page and return its details as a dict.

    Args:
        url: URL of the listing's detail page.

    Returns:
        A dict holding the page URL ('网页'), title ('标题'), total price
        ('总价') and contact phone ('联系电话'), plus one entry for the
        publish-time ``<span>`` and one entry per ``<dd>`` element whose
        text contains a colon (text before the first colon is the key, the
        remainder is the value).

    Raises:
        requests.RequestException: if the request fails or times out.
        IndexError: if the page lacks ``h1``, ``.red20b`` or ``#mobilecode``,
            or lacks ``#Time`` when a publish-time span is present.
    """
    info = {'网页': url}
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'html5lib')

    info['标题'] = soup.select('h1')[0].text.strip()       # title
    info['总价'] = soup.select('.red20b')[0].text + '万'   # total price
    info['联系电话'] = soup.select('#mobilecode')[0].text  # phone number

    # Publish time: the span text has the form "<label>:<value>(" and the
    # remainder of the timestamp is taken from the #Time element.
    for span in soup.select('span'):
        text = span.text.strip().rstrip('(')
        if '发布时间' in text and ':' in text:
            # Split on the first colon only, so a value that itself contains
            # ':' (e.g. a clock time) does not break the two-name unpacking.
            key, value = text.split(':', 1)
            info[key] = value + '*' + soup.select('#Time')[0].text

    # Detail fields: every <dd> that looks like "<label>:<value>".
    for dd in soup.select('dd'):
        text = dd.text.strip()
        if ':' in text:
            key, value = text.split(':', 1)
            info[key] = value
    return info


print('----------正在运行,请不要关闭----------')
url_home = 'http://esf.' + url_in + '.fang.com/house/i3{}/'
for url_next in range(1, url_number):
    url_all.append(url_home.format(url_next))

home = []
for i in url_all:
    a = open(i)
    print('正在获取 -----> ', i, ' <-----')
    time.sleep(1)  # pause between index-page requests
    for b in a:
        home.append(content(b))
        print(' 正在获取详细信息 -> ', b, ' <-----')
        time.sleep(2)  # pause between detail-page requests

last = pandas.DataFrame(home)
last.to_excel('temp.xlsx', sheet_name='房源信息')
print('----------运行结束---------- ----------查看根目录---------')
# Wait for a key press so the output stays visible before the process exits.
abcdefg = input('完成运行')
源码先奉上,以后再填坑