"""Scrape the Douban Top-250 movie list and append each entry to a JSON file."""
import json
import time
from typing import Iterator, Optional

import requests
from lxml import etree

# Mobile browser User-Agent sent with every request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Mobile Safari/537.36'
}

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


# Fetch a web page.
def getpage(url: str) -> Optional[str]:
    """Download *url* and return the response body as text.

    Returns ``None`` when the request fails (connection error, timeout, ...)
    or when the server answers with any status code other than 200.
    """
    try:
        # A timeout keeps a stalled connection from blocking the script forever.
        res = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        # Only network-level failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return None
    if res.status_code != 200:
        return None
    return res.text


# Parse a web page.
def parsepage(html: str) -> Iterator[dict]:
    """Yield one dict per ``<div class="item">`` element found in *html*.

    Every value is the list returned by the corresponding XPath query, so a
    field that is missing from the page shows up as an empty list.
    """
    ht = etree.HTML(html)
    for item in ht.xpath('//div[@class="item"]'):
        yield {
            'title': item.xpath('.//span[@class="title"]/text()'),
            # ``item`` already is the div.item element, so the <em> is looked
            # up directly beneath it; the previous query searched for another
            # div.item nested inside the current one.
            'index': item.xpath('.//em/text()'),
            'score': item.xpath('.//span[@class="rating_num"]/text()'),
            'actor': item.xpath('.//p[@class=""]/text()'),
            'image': item.xpath('.//img[@width="100"]/@src'),
        }


# Write to file.
def writefile(item: dict) -> None:
    """Append *item* to ``豆瓣.json`` as a single line of JSON.

    *item* must contain a ``'title'`` key; it is used in the progress message.
    """
    with open('豆瓣.json', 'a', encoding='utf-8') as f:
        print('正在写入数据{}...'.format(item['title']))
        f.write(json.dumps(item, ensure_ascii=False))
        # One record per line, so the file can be read back line by line.
        f.write('\n')


# Main function.
def main(offset: int) -> None:
    """Fetch the list page that starts at *offset* and store all its entries.

    Nothing is written when the page could not be downloaded.
    """
    url = 'https://movie.douban.com/top250?start={}'.format(offset)
    html = getpage(url)
    print('正在解析程序.....')
    if not html:
        return
    for entry in parsepage(html):
        writefile(entry)


if __name__ == "__main__":
    # Offsets 0, 25, ..., 225: ten requests of 25 entries each.
    for i in range(0, 250, 25):
        main(offset=i)
        # Pause between requests.
        time.sleep(2)