# 2017房价 (2017 housing prices)
"""Scrape a 2017 city house-price / salary ranking article into an Excel workbook.

The script downloads one news article, pulls the per-city figures out of its
paragraphs and writes them to an ``.xlsx`` file in the working directory.
"""

import re
from itertools import islice

import bs4
import openpyxl
import requests

ARTICLE_URL = 'https://news.house.qq.com/a/20170702/003985.htm'
OUTPUT_FILE = "2017年中国主要城市房价工资比排行榜.xlsx"

# Browser-like User-Agent sent with the request.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}

# City name enclosed in literal square brackets, e.g. "[name]".
_CITY_RE = re.compile(r'\[(.+)\]')
# Everything from the first digit to the end of the paragraph.
_FIGURE_RE = re.compile(r'\d.*')
# A plain integer or decimal number with nothing else around it.
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')

# Paragraphs that follow a rank paragraph: city plus three figures.
_FIELDS_PER_ENTRY = 4


def open_url(url: str, timeout: float = 10) -> requests.Response:
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
    """
    res = requests.get(url, headers=REQUEST_HEADERS, timeout=timeout)
    # Fail here with a clear HTTP error instead of later while parsing an
    # error page.
    res.raise_for_status()
    return res


def _parse_entries(texts) -> list:
    """Group paragraph texts into ``[city, figure, figure, figure]`` rows.

    An entry starts at a paragraph whose text is purely numeric (the rank);
    the four paragraphs after it hold the bracketed city name and three
    figures. Entries that do not match that shape are skipped.
    """
    rows = []
    # A list is iterable but is not an iterator, so calling next() on it
    # directly fails with "'list' object is not an iterator". iter() gives
    # one shared iterator, which lets the loop body consume the paragraphs
    # that belong to the current entry.
    stream = iter(texts)
    for text in stream:
        if not text.isnumeric():
            continue
        fields = list(islice(stream, _FIELDS_PER_ENTRY))
        if len(fields) < _FIELDS_PER_ENTRY:
            # The page ended in the middle of an entry.
            break
        city = _CITY_RE.search(fields[0])
        figures = [_FIGURE_RE.search(field) for field in fields[1:]]
        if city is None or any(match is None for match in figures):
            continue
        rows.append([city.group(1), *(match.group() for match in figures)])
    return rows


def find_infor(res) -> list:
    """Extract one row per city from the downloaded article.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of ``[city, figure, figure, figure]`` string lists, in page
        order.

    Raises:
        ValueError: If the article container is not present in the page.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    content = soup.find(id="C-Main-Article-QQ")
    if content is None:
        raise ValueError('article container "C-Main-Article-QQ" not found in page')
    targets = content.find_all('p', style="TEXT-INDENT: 2em")
    return _parse_entries(target.text for target in targets)


def _to_number(value):
    """Return *value* as int/float when it is a plain numeric string."""
    if isinstance(value, str) and _NUMBER_RE.fullmatch(value):
        return float(value) if '.' in value else int(value)
    return value


def to_excel(data):
    """Write *data* to the output workbook below a fixed header row.

    Args:
        data: Iterable of rows, each an iterable of cell values.
    """
    wb = openpyxl.Workbook()
    # NOTE(review): type guessing appears to have been dropped from newer
    # openpyxl releases -- confirm. The flag is kept for older versions and
    # plain numeric strings are converted explicitly below so that they are
    # stored as numbers either way.
    wb.guess_types = True
    ws = wb.active
    ws.append(['城市', '平均房价', '平均工资', '房价工资比'])
    for each in data:
        ws.append([_to_number(cell) for cell in each])
    wb.save(OUTPUT_FILE)


def main():
    """Download the article, extract the rows and save the workbook."""
    res = open_url(ARTICLE_URL)
    data = find_infor(res)
    to_excel(data)


if __name__ == "__main__":
    main()