#-*-coding:utf-8-*-
from urllib import request,parse
import requests
import importlib,sys,re
importlib.reload(sys)
import os
import pdb
# sys.setdefaultencoding('utf8')
# Destination text file for the ranking.
# NOTE(review): the directory part has no separators after the drive letter;
# confirm this is the intended location.
file_name = (r'E:YSpracticemovie' + os.sep + '豆瓣电影排行250' + '.txt')

BASE_URL = 'https://movie.douban.com/top250?'
PAGE_COUNT = 10
# 10 pages cover a 250-entry list, i.e. 25 entries per page; the `start`
# query parameter is therefore a row offset, not a page number.
PAGE_SIZE = 25
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled socket
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'}

SPAN_OPEN = '<span class="title">'
SPAN_CLOSE = '</span>'
# Compiled once instead of on every page.
TITLE_SPAN_RE = re.compile(r'<span class="title">.*?</span>')


def build_page_url(page_index: int) -> str:
    """Return the listing URL for the zero-based page *page_index*.

    The page index is converted to a row offset (``page_index * PAGE_SIZE``)
    so that consecutive pages do not overlap.
    """
    query = parse.urlencode({'start': page_index * PAGE_SIZE})
    return BASE_URL + query + '&filter='


def fetch_page(url: str) -> str:
    """Download *url* with the browser-like headers and return it as text.

    The body is decoded as UTF-8. Network errors raised by ``urlopen``
    propagate to the caller.
    """
    page_request = request.Request(url=url, headers=HEADERS)
    # The context manager closes the HTTP response even if read() fails.
    with request.urlopen(page_request, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode('utf-8')


def primary_titles(title_spans) -> list:
    """Return the title text of each primary-title span, in order.

    *title_spans* is an iterable of raw ``<span class="title">...</span>``
    strings as matched by ``TITLE_SPAN_RE``.  Spans whose text contains a
    non-breaking space (the ``&nbsp;`` entity or the U+00A0 character) are
    skipped.

    NOTE(review): this assumes alternate-language titles are exactly the
    spans containing a non-breaking space -- confirm against the live page.
    """
    titles = []
    for span in title_spans:
        # Strip the known opening/closing tags instead of splitting on
        # '<' / '>', which would truncate a title containing those characters.
        text = span[len(SPAN_OPEN):-len(SPAN_CLOSE)]
        if '&nbsp;' in text or '\xa0' in text:
            continue
        titles.append(text)
    return titles


def main(output_path: str = file_name) -> None:
    """Scrape every listing page and write one ranked title per line.

    Each line has the form ``第N名-------<title>``; N starts at 1 and keeps
    increasing across pages.  Progress messages and the raw matches of each
    page are printed to stdout.
    """
    directory = os.path.dirname(output_path)
    if directory:
        # Create the target directory up front so open() does not fail.
        os.makedirs(directory, exist_ok=True)

    rank = 1
    with open(output_path, "w", encoding='utf-8') as f:
        for page_index in range(PAGE_COUNT):
            print('正在爬去第%d页' % (page_index + 1))
            html = fetch_page(build_page_url(page_index))
            title_spans = TITLE_SPAN_RE.findall(html)
            print(title_spans)
            for title in primary_titles(title_spans):
                f.write(u'第%d名' % rank + '-------' + title + '\n')
                rank += 1
            print('第%d页爬出成功' % (page_index + 1))


if __name__ == '__main__':
    main()