On a whim, I scraped the Douban Movie Top 250 listings. A few topics worth recording:
Two ways of parsing the data:
first, parsel;
second, converting the HTML into a Python object and parsing it with XPath (a good XPath refresher), using etree from lxml — see the sketch after the first script below.
Ways of saving the data:
first, csv;
second, openpyxl;
third, pandas;
fourth, saving to a MySQL database via pymysql — the openpyxl/pandas and pymysql sketches also follow the first script.
Multi-threaded crawling — sketched at the end.
First up: single-threaded scraping, with parsel for parsing and csv for saving. The Douban Top 250 target barely changes, so the crawler code stays roughly the same across all the variants.
import csv
import requests
import parsel
import time
import random

f = open('豆瓣top250电影信息.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    '电影名',
    '外文名',
    '港台名',
    '是否可播放',
    '上映年份',
    '详情地址',
    '导演',
    '编剧',
    '主演',
    '评分',
    '评分人数',
])

csvWriter.writeheader()  # write the header row
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
}

def get_data(url):
    """
    Fetch one list page and collect the data for every movie on it.
    """
    response = requests.get(url=url, headers=headers)
    selector = parsel.Selector(response.text)  # build the selector
    # grab the list of movie entries
    lis = selector.css('#content ol li')
    # loop over each entry and pull out the fields we need
    for item in lis:
        title = item.css('.info .hd a span:nth-child(1)::text').get()  # Chinese title
        try:
            foreignTitle = item.css('.info .hd a span:nth-child(2)::text').get().strip().replace('/', '')  # foreign title (English or other languages)
        except:
            foreignTitle = '未定!'
        try:
            gtTitle = item.css('.info .hd a .other::text').get().strip().lstrip('/')  # Hong Kong/Taiwan title
        except:
            gtTitle = '未定!'
        try:
            canbePlayed = item.css('.playable::text').get().strip('[').strip(']')  # whether it can be streamed
        except:
            canbePlayed = '未知!'
        movieinfo = item.css('.bd p::text').getall()  # director, cast, release year, etc.
        releaseYear = movieinfo[1].split('/')[0].strip()  # release year

        director = scenarist = actors = ''  # defaults in case the detail page yields nothing
        detailPage = item.css('.hd a::attr(href)').get()  # detail-page URL, requested for director/cast info
        if detailPage:
            response = requests.get(url=detailPage, headers=headers)
            selector = parsel.Selector(response.text)
            results = selector.css('#info')
            mactors = selector.css('.actor .attrs')
            for i in results:
                director = i.css('span:nth-child(1) a::text').get()  # director
                scenarist = ' / '.join(i.css('span:nth-child(3) .attrs a::text').getall())  # scenarists, joined into one string
            for j in mactors:
                actors = ' / '.join(j.css('span a::text').getall())  # main cast, joined into one string

        reviewScore = item.css('.star .rating_num::text').get()  # rating
        reviewCount = item.css('.star span:nth-child(4)::text').get().strip('人评价')  # number of ratings
        print(title, foreignTitle, gtTitle, canbePlayed, releaseYear, detailPage, director,
              scenarist, actors, reviewScore, reviewCount, sep=' | ')
        dit = {
            '电影名': title,
            '外文名': foreignTitle,
            '港台名': gtTitle,
            '是否可播放': canbePlayed,
            '上映年份': releaseYear,
            '详情地址': detailPage,
            '导演': director,
            '编剧': scenarist,
            '主演': actors,
            '评分': reviewScore,
            '评分人数': reviewCount,
        }
        csvWriter.writerow(dit)  # write one row per movie

# 10 list pages, 25 movies each
for page in range(0, 250, 25):
    print(f'-------------------- collecting page {page // 25 + 1} --------------------')
    url = f'https://movie.douban.com/top250?start={page}&filter='
    time.sleep(random.uniform(2, 5))  # random delay to avoid hammering the site
    get_data(url)

f.close()
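For the second parsing approach, the same list page can be walked with lxml's etree and XPath instead of parsel. A minimal sketch of the idea, assuming the same headers dict and page structure as the script above; it only pulls a few of the fields, and the function name get_data_xpath is my own placeholder:

import requests
from lxml import etree

def get_data_xpath(url):
    """Parse one Top250 list page with XPath instead of parsel (sketch only)."""
    response = requests.get(url=url, headers=headers)
    html = etree.HTML(response.text)  # build an element tree from the HTML text
    lis = html.xpath('//*[@id="content"]//ol/li')  # one <li> per movie
    for item in lis:
        title = item.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]       # Chinese title
        detailPage = item.xpath('.//div[@class="hd"]/a/@href')[0]           # detail-page URL
        reviewScore = item.xpath('.//span[@class="rating_num"]/text()')[0]  # rating
        print(title, detailPage, reviewScore, sep=' | ')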
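For saving, openpyxl and pandas can both replace the csv writer. A minimal sketch, assuming the per-movie dit dictionaries built above are collected into a list first; the list name allData and the output file names are placeholders:

import pandas as pd
from openpyxl import Workbook

allData = []  # in the crawler loop: allData.append(dit) instead of csvWriter.writerow(dit)

# Option 1: openpyxl - write a header row, then one row per movie.
wb = Workbook()
ws = wb.active
ws.append(['电影名', '外文名', '港台名', '是否可播放', '上映年份',
           '详情地址', '导演', '编剧', '主演', '评分', '评分人数'])
for row in allData:
    ws.append(list(row.values()))
wb.save('豆瓣top250电影信息_openpyxl.xlsx')

# Option 2: pandas - build a DataFrame from the list of dicts and dump it in one call.
df = pd.DataFrame(allData)
df.to_excel('豆瓣top250电影信息_pandas.xlsx', index=False)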
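The fourth saving option writes each record into MySQL through pymysql. A minimal sketch, assuming a local MySQL instance with a douban_top250 table already created to match the eleven fields; the connection parameters and the table/column names are placeholders:

import pymysql

# Placeholder connection settings - adjust to your own MySQL instance.
conn = pymysql.connect(host='localhost', user='root', password='your_password',
                       database='spider', charset='utf8mb4')
cursor = conn.cursor()

def save_to_mysql(dit):
    """Insert one movie record (the same dict the crawler builds) into MySQL."""
    sql = """INSERT INTO douban_top250
             (title, foreign_title, gt_title, playable, release_year,
              detail_url, director, scenarist, actors, score, review_count)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    cursor.execute(sql, tuple(dit.values()))
    conn.commit()

# In the crawler loop, call save_to_mysql(dit) instead of csvWriter.writerow(dit);
# once the crawl finishes: cursor.close() and conn.close().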
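Finally, multi-threaded crawling: each list page is independent, so the per-page work can be handed to a thread pool. A minimal sketch with concurrent.futures that reuses get_data() from the first script; the pool size of 5 is arbitrary, and in a real run the shared csv writer would want a lock, since several threads write rows at once:

import time
import random
from concurrent.futures import ThreadPoolExecutor

def crawl_page(page):
    """Fetch and parse one Top250 list page; reuses get_data() from the script above."""
    url = f'https://movie.douban.com/top250?start={page}&filter='
    time.sleep(random.uniform(2, 5))  # keep a polite, randomised delay inside each task
    get_data(url)

# 10 list pages of 25 movies each, spread over a small thread pool.
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(crawl_page, range(0, 250, 25))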