抓取猫眼电影排行
提取猫眼电影TOP100的电影名称、时间、评分、图片等信息,提取的站点URL为 http://maoyan.com/board/4,提取的结果会以文件形式保存下来。
正则表达式版本:
"""Scrape the Maoyan TOP100 movie board (regex version).

Fetches http://maoyan.com/board/4 page by page (offset 0, 10, ..., 90),
extracts index, image URL, title, actors, release time and score for each
movie, and appends each record as one JSON line to result.txt.
"""
from multiprocessing import Pool
import json
import re

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Return the HTML text of *url*, or None on any request failure."""
    headers = {
        # Desktop User-Agent; without it Maoyan may serve a different page.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Compiled once at module level; matches one <dd> block per movie.
# BUG FIX: the original had (d+) — the backslash of \d was lost, so no
# movie index (and therefore no movie at all) could ever match.
MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)


def parse_one_page(html):
    """Yield one dict per movie found in *html*."""
    for item in MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix (3 chars)
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix (5 chars)
            'score': item[5] + item[6],     # integer part + fractional part
        }


def write_to_file(content):
    """Append *content* as one JSON line to result.txt."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # BUG FIX: the newline was lost (was + ' '); records must be
        # newline-separated to be one JSON object per line.
        # (The redundant f.close() was removed: `with` already closes f.)
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Scrape one board page at the given *offset* and persist its movies."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # BUG FIX: on request failure the original passed None into
        # parse_one_page and crashed with TypeError; skip the page instead.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # BUG FIX: the original ran the sequential for-loop AND the Pool,
    # scraping every page twice and duplicating every record in result.txt.
    # Keep only the parallel version, and shut the pool down cleanly.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
"""Scrape the Maoyan TOP100 movie board (lxml/XPath version).

Same observable behavior as the regex version: fetches each board page and
appends one JSON line per movie to result.txt.
"""
#from multiprocessing import Pool
import json
#import re
import requests
from requests.exceptions import RequestException
from lxml import etree
from urllib import parse  # kept from the original; currently unused here


def get_one_page(url):
    """Return the HTML text of *url*, or None on any request failure."""
    headers = {
        # Desktop User-Agent; without it Maoyan may serve a different page.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def _first(nodes, default=''):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def parse_one_page(html):
    """Yield one dict per movie <dd> entry found in *html*.

    BUG FIXES versus the original:
    - the loop iterated <dl> nodes but then queried via `text` (the whole
      document) instead of the current node, so every iteration re-read the
      same data;
    - `//dl[@board-wrapper]` selected on an attribute *named* board-wrapper
      instead of class="board-wrapper";
    - only the hard-coded class "board-index-1" was matched, so movies
      2..100 were never found;
    - `.xpath(...).text` raised AttributeError (xpath returns a list);
    - the image URL lives in @data-src (lazy loading) and the score is split
      into integer/fraction <i> children — both established by the regex
      pattern elsewhere in this file;
    - the result is now yielded (as the commented-out caller expected)
      instead of clobbering a single `items` dict each iteration.
    """
    tree = etree.HTML(html)
    # NOTE(review): assumes the board list is <dl class="board-wrapper">,
    # matching the original's intent — confirm against the live page markup.
    for dd in tree.xpath('//dl[@class="board-wrapper"]/dd'):
        index = _first(dd.xpath('.//i[contains(@class, "board-index")]/text()'))
        image = _first(dd.xpath('.//img[@class="board-img"]/@data-src'))
        title = _first(dd.xpath('.//p[@class="name"]/a/text()'))
        actor = _first(dd.xpath('.//p[@class="star"]/text()')).strip()[3:]
        release = _first(dd.xpath('.//p[@class="releasetime"]/text()')).strip()[5:]
        score = (_first(dd.xpath('.//p[@class="score"]/i[@class="integer"]/text()'))
                 + _first(dd.xpath('.//p[@class="score"]/i[@class="fraction"]/text()')))
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actor': actor,
            'time': release,
            'score': score,
        }


def write_to_file(content):
    """Append *content* as one JSON line to result.txt."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # BUG FIX: the newline was lost (was + ' '); records must be
        # newline-separated. The redundant f.close() inside `with` is gone.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Scrape one board page at the given *offset* and persist its movies."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Skip the page on request failure instead of crashing on None.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
    #pool = Pool()
    #pool.map(main, [i * 10 for i in range(10)])