Feeling a bit glum; not much to say. I spent a whole afternoon on this and most of it was copied. Regex is really hard to get right, and I still don't know much about the other third-party libraries involved.
Still pretty rough, but keep at it.
import requests
import time
import json
import re
from requests.exceptions import RequestException


def get_one_page(url):
    # Fetch one page of the Maoyan TOP100 board; return its HTML, or None on failure.
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print("测试点")  # debug checkpoint
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    # Pull rank, poster URL, title, stars, release date and score out of the page with one regex.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    print("测试点4")
    items = re.findall(pattern, html)
    print("测试点5")
    for item in items:
        yield {
            '排名': item[0],
            '图片': item[1],
            '电影名': item[2],
            '主演': item[3].strip()[3:],   # drop the "主演:" prefix
            '时间': item[4].strip()[5:],   # drop the "上映时间:" prefix
            '分数': item[5] + item[6]      # integer part + fraction part
        }


def write_to_file(content):
    # Append each record as one JSON line to the output file.
    with open(r'F:\猫眼.txt', 'a+', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    print("测试点2")
    for item in parse_one_page(html):
        print("测试点3")
        print(item)
        write_to_file(item)


if __name__ == "__main__":
    # The board has 100 movies, 10 per page, selected via the offset parameter.
    for i in range(10):
        print("测试点")
        main(offset=i * 10)
        time.sleep(1)
The main point of this exercise is getting familiar with regular expressions.
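To see the regex on its own, here is a minimal, self-contained sketch that runs the same pattern against a hand-written snippet. The HTML below is made up and only mimics the structure of a single <dd> entry on the board page; the title, actor names and URL are placeholders, not real data.

import re

# Made-up snippet imitating one <dd> entry of the Maoyan board page.
sample_html = '''
<dd>
  <i class="board-index board-index-1">1</i>
  <img data-src="https://example.com/poster.jpg" alt="">
  <p class="name"><a href="/films/0" title="示例电影">示例电影</a></p>
  <p class="star">主演:演员甲,演员乙</p>
  <p class="releasetime">上映时间:1999-01-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>
'''

# Same pattern as in the script above.
pattern = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)

for rank, img, title, star, release, integer, fraction in re.findall(pattern, sample_html):
    # Strip the "主演:" (3 chars) and "上映时间:" (5 chars) prefixes, same as the script.
    print(rank, title, star.strip()[3:], release.strip()[5:], integer + fraction)

Running this prints: 1 示例电影 演员甲,演员乙 1999-01-01 9.5 — seven capture groups per <dd>, one tuple per movie.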
Source code:
https://github.com/Python3WebSpider/MaoYan/blob/master/spider.py