两个主要的 Python 代码如下:
import requests
from bs4 import BeautifulSoup

# Fetch the 17k.com home page, take the first <li> link of every
# <ul class="Top1"> list, and save each text/href pair to 17kmovie.txt.
url = 'https://www.17k.com/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
content = response.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
listA = soup.find_all(name='ul', attrs={"class": "Top1"})

movie_list = []
for each in listA:
    first_item = each.find("li")
    link = first_item.a if first_item is not None else None
    href = link.get("href") if link is not None else None
    if href is None:
        # Skip lists whose first item has no usable link; dereferencing a
        # missing tag would otherwise raise AttributeError.
        continue
    all1 = href.strip()
    all2 = link.text.strip("[]")
    movie_list.append([" 电影名: ", all2, "电影链接: ", all1])

# The with-statement closes the file on exit, so no explicit close() is needed.
with open("17kmovie.txt", "w", encoding="utf-8") as f:
    for entry in movie_list:
        f.write(str(entry))
        f.write(" ")
import requests
from bs4 import BeautifulSoup


def _parse_movie(entry):
    """Build one movie record from a ``div.info`` element.

    Returns the record as a list, or ``None`` when an expected child
    element is missing so that the caller can skip the entry.
    """
    try:
        head = entry.find('div', class_="hd")
        title = head.span.text.strip()
        title2 = head.a.get("href").strip()
        info = entry.find('div', class_='bd').p.text.strip()
        star = entry.find('span', class_='rating_num').text.strip()
        # NOTE(review): index 7 counts whitespace text nodes as children;
        # presumably this is the "number of ratings" span - verify against
        # the live page markup.
        people = entry.find('div', class_='star').contents[7].text.strip()
    except (AttributeError, IndexError):
        # A missing tag yields None (AttributeError on access) and a short
        # child list yields IndexError; either way the entry is unusable.
        return None
    # Collapse newlines, non-breaking spaces and runs of whitespace.
    info = info.replace('\n', '').replace('\xa0', '')
    info = ' '.join(info.split())
    return ["电影名: ", title, "电影链接 ", title2, info, star, people]


def get_movie():
    """Scrape the Douban Top 250 listing and return its movie records.

    Requests the ten listing pages (``start`` = 0, 25, ..., 225) and parses
    every ``div.info`` element on each page.

    Returns:
        A list of records, each of the form
        ``["电影名: ", title, "电影链接 ", link, info, rating, people]``.
        Entries whose markup lacks an expected element are skipped.
    """
    # Request headers: a browser user-agent string.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    movie_list = []
    for i in range(0, 10):
        # Request address of the i-th page (25 entries per page).
        url = 'https://movie.douban.com/top250?start=' + str(i * 25)
        # A timeout keeps one stalled page from hanging the whole scrape.
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        for each in soup.find_all('div', class_='info'):
            record = _parse_movie(each)
            if record is not None:
                movie_list.append(record)
    return movie_list


movie = get_movie()
# The with-statement closes the file on exit, so no explicit close() is needed.
with open("Top_movie_250.txt", "w", encoding="utf-8") as f:
    for record in movie:
        f.write(str(record))
        f.write(" ")
实验结果如下:
将其写到文件中:
用到的都是之前学到的知识点。
(发现的问题是:有的时候,例如 span 标签,存在没有该属性的情况,进而在获取 text 时会出现属性访问失败的错误。最后是自己通过测试发现并解决的。)