自己跟着视频学习的第一个爬虫小程序,里面有许多不太清楚的地方,比如是怎么找到具体的电影名字的:有那么多层级关系,怎么一下就找到了那个div呢?
诸如此类的疑问还有许多,不过先做起来再说吧,后续再去弄懂。
import re

import bs4
import requests

# Browser-like User-Agent header sent with every request.
_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
}

# Seconds to wait for the server before a request is abandoned.
_TIMEOUT = 10

# Step between the ``start`` offsets of two consecutive result pages.
_PAGE_SIZE = 25


def open_url(url, proxies=None, timeout=_TIMEOUT):
    """Download *url* and return the response.

    Args:
        url: Address of the page to fetch.
        proxies: Optional proxy mapping handed to ``requests.get`` unchanged,
            e.g. ``{'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'}``.
            ``None`` (the default) sends the request without this setting.
        timeout: Seconds to wait before the request is abandoned.

    Returns:
        The ``requests.Response`` object for the page.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, headers=_HEADERS, proxies=proxies, timeout=timeout)
    # Fail here instead of letting the parsers choke on an error page.
    res.raise_for_status()
    return res


def find_movies(res):
    """Extract one text record per movie from a fetched page.

    Three lists are collected independently from the parsed page:

    * titles: text of ``a > span`` inside every ``div.hd``;
    * ratings: text of every ``span.rating_num``, prefixed with ``评分:``;
    * details: pieces 1 and 2 of the first ``<p>`` text of every ``div.bd``.

    They are then merged by position.

    Args:
        res: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        A list of strings, each ``title + rating + details + ' '``. If the
        three lists differ in length, the surplus entries are dropped.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Movie titles.
    movies = [each.a.span.text for each in soup.find_all('div', class_="hd")]

    # Ratings.
    ranks = [
        '评分:{}'.format(each.text)
        for each in soup.find_all('span', class_='rating_num')
    ]

    # Details.
    # NOTE(review): splitting on a single space and taking pieces 1 and 2
    # (and ending each record with ' ') may have been meant to use line
    # breaks instead - confirm against the live markup.
    messages = []
    for each in soup.find_all("div", class_='bd'):
        try:
            pieces = each.p.text.split(' ')
            messages.append(pieces[1].strip() + pieces[2].strip())
        except (AttributeError, IndexError):
            # This div has no <p>, or its text has fewer than three pieces.
            continue

    # zip() pairs the lists by position and stops at the shortest one, so a
    # skipped details block can no longer trigger an IndexError.
    return [
        title + rank + info + ' '
        for title, rank, info in zip(movies, ranks, messages)
    ]


# Work out how many result pages there are.
def find_depth(res):
    """Return the page count read from the pager on a fetched page.

    The number is the text of the node two siblings before the first
    ``span.next`` element.

    Args:
        res: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If the pager markup is missing or the text found there
            is not an integer.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_marker = soup.find('span', class_='next')
    try:
        depth = next_marker.previous_sibling.previous_sibling.text
    except AttributeError as err:
        # Either span.next or one of the expected siblings is absent.
        raise ValueError("could not locate the page count in the pager markup") from err
    return int(depth)


def main():
    """Fetch every page of the chart and write the records to a text file."""
    host = "https://movie.douban.com/top250"
    depth = find_depth(open_url(host))

    result = []
    for i in range(depth):
        url = host + '/?start=' + str(_PAGE_SIZE * i)
        result.extend(find_movies(open_url(url)))

    with open("豆瓣TOP250电影.txt", "w", encoding='utf-8') as f:
        f.writelines(result)


if __name__ == "__main__":
    main()