• 。。


    抓取猫眼电影排行

     

    提取猫眼电影TOP100的电影名称、时间、评分、图片等信息,提取的站点URL为 http://maoyan.com/board/4 ,提取的结果会以文件形式保存下来。

    正则:

     1 from multiprocessing import Pool
     2 import json
     3 import requests
     4 from requests.exceptions import RequestException
     5 import re
     6 
     def get_one_page(url, timeout=10):
         """Download ``url`` and return the response body as text.

         Args:
             url: Address of the page to fetch.
             timeout: Seconds to wait for the server before giving up. Without
                 a timeout ``requests.get`` can block indefinitely on a
                 stalled connection.

         Returns:
             The decoded response text when the server answers with HTTP 200,
             otherwise ``None`` (non-200 status, timeout, or any other
             ``RequestException``).
         """
         # A browser-like User-Agent is sent with every request.
         headers = {
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
         }
         try:
             response = requests.get(url, headers=headers, timeout=timeout)
         except RequestException:
             # Timeouts are RequestException subclasses, so they land here too.
             return None
         if response.status_code == 200:
             return response.text
         return None
    18 
    def parse_one_page(html):
        """Yield one dict per ``<dd>`` movie entry found in ``html``.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example after a failed download) yields nothing.

        Yields:
            Dicts with the string fields ``index``, ``image``, ``title``,
            ``actor``, ``time`` and ``score``.
        """
        if not html:
            return
        # Raw strings keep ``\d`` intact; the rank is a run of digits.
        # re.S lets ``.`` span the newlines inside each <dd> element.
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
            re.S,
        )
        for item in pattern.findall(html):
            yield {
                'index': item[0],
                'image': item[1],
                'title': item[2],
                # Drops the first 3 characters after trimming -- presumably
                # a leading label such as "主演:"; verify against the page.
                'actor': item[3].strip()[3:],
                # Drops the first 5 characters after trimming -- presumably
                # a leading label such as "上映时间:"; verify against the page.
                'time': item[4].strip()[5:],
                # The score is split into integer and fraction elements.
                'score': item[5] + item[6],
            }
    33 
    def write_to_file(content):
        """Append ``content`` to ``result.txt`` as one JSON document per line.

        Args:
            content: Any JSON-serialisable object.

        The file is opened in append mode as UTF-8, and ``ensure_ascii=False``
        keeps non-ASCII text readable instead of ``\\uXXXX`` escapes.
        """
        # The ``with`` block closes the file; no explicit close() is needed.
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    38 
    def main(offset):
        """Fetch one board page, then print and store every entry on it.

        Args:
            offset: Value for the ``offset`` query parameter of the board URL.
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None on a failed download; skip the page
            # rather than handing None to the parser.
            print('Failed to fetch', url)
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)
    45 
    if __name__ == '__main__':
        # Offsets 0, 10, ..., 90 -- one per board page.
        offsets = [i * 10 for i in range(10)]
        # Scrape each page exactly once. Running a sequential loop and then
        # the pool over the same offsets fetched every page twice and wrote
        # every record to the result file twice.
        # NOTE: pages are processed in parallel worker processes, so the order
        # of lines in the output file is not guaranteed to follow the offsets.
        with Pool() as pool:
            pool.map(main, offsets)
     1 #from multiprocessing import Pool
     2 import json
     3 import requests
     4 from requests.exceptions import RequestException
     5 #import re
     6 from lxml import etree
     7 from urllib import parse
     8 
     def get_one_page(url, timeout=10):
         """Download ``url`` and return the response body as text.

         Args:
             url: Address of the page to fetch.
             timeout: Seconds to wait for the server before giving up. Without
                 a timeout ``requests.get`` can block indefinitely on a
                 stalled connection.

         Returns:
             The decoded response text when the server answers with HTTP 200,
             otherwise ``None`` (non-200 status, timeout, or any other
             ``RequestException``).
         """
         # A browser-like User-Agent is sent with every request.
         headers = {
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
         }
         try:
             response = requests.get(url, headers=headers, timeout=timeout)
         except RequestException:
             # Timeouts are RequestException subclasses, so they land here too.
             return None
         if response.status_code == 200:
             return response.text
         return None
    20 
    def _first_text(node, expr):
        """Return the first XPath result of ``expr`` on ``node``, stripped, or ''."""
        found = node.xpath(expr)
        return found[0].strip() if found else ''


    def parse_one_page(html):
        """Parse a board page with lxml and append every entry to the result file.

        Each ``<dd>`` entry is turned into a dict with the string fields
        ``index``, ``image``, ``title``, ``actor``, ``time`` and ``score`` and
        passed to ``write_to_file``.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example after a failed download) is ignored.
        """
        if not html:
            return
        root = etree.HTML(html)
        if root is None:
            return
        # Assumes the entries live in <dl class="board-wrapper"> -- TODO confirm
        # against the live page. ``[@board-wrapper]`` would test for an
        # attribute of that name, not for the class value.
        for node in root.xpath('//dl[@class="board-wrapper"]/dd'):
            # All lookups are relative to the current <dd>; querying the
            # document root would not scope the search to this entry.
            record = {
                # The rank class carries a per-rank suffix, so match on the
                # shared "board-index" part.
                'index': _first_text(node, './/i[contains(@class, "board-index")]/text()'),
                # The image address is read from the data-src attribute.
                'image': _first_text(node, './/img/@data-src'),
                'title': _first_text(node, './/p[@class="name"]/a/text()'),
                # Drops the first 3 characters after trimming -- presumably
                # a leading label such as "主演:"; verify against the page.
                'actor': _first_text(node, './/p[@class="star"]/text()')[3:],
                # Drops the first 5 characters after trimming -- presumably
                # a leading label such as "上映时间:"; verify against the page.
                'time': _first_text(node, './/p[@class="releasetime"]/text()')[5:],
                # The score is split into integer and fraction elements.
                'score': (_first_text(node, './/i[@class="integer"]/text()')
                          + _first_text(node, './/i[@class="fraction"]/text()')),
            }
            write_to_file(record)
    41 '''
    42 def parse_one_page(html):
    43     pattern = re.compile('<dd>.*?board-index.*?>(d+)</i>.*?data-src="(.*?)".*?name"><a'
    44                          + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    45                          + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    46     items = re.findall(pattern,html)
    47     for item in items:
    48         yield {
    49             'index':item[0],
    50             'image':item[1],
    51             'title':item[2],
    52             'actor':item[3].strip()[3:],
    53             'time': item[4].strip()[5:],
    54             'score': item[5] + item[6]
    55         }
    56 '''
    def write_to_file(content):
        """Append ``content`` to ``result.txt`` as one JSON document per line.

        Args:
            content: Any JSON-serialisable object.

        The file is opened in append mode as UTF-8, and ``ensure_ascii=False``
        keeps non-ASCII text readable instead of ``\\uXXXX`` escapes.
        """
        # The ``with`` block closes the file; no explicit close() is needed.
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    61 
    def main(offset):
        """Fetch one board page and hand its markup to ``parse_one_page``.

        Args:
            offset: Value for the ``offset`` query parameter of the board URL.
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None on a failed download; skip the page
            # rather than handing None to the parser.
            print('Failed to fetch', url)
            return
        parse_one_page(html)
    69 
    if __name__ == '__main__':
        # Visit the ten board pages in order; the offset advances by ten
        # per page (0, 10, ..., 90).
        for page_offset in range(0, 100, 10):
            main(offset=page_offset)
  • 相关阅读:
    Idea快捷键---根据自己使用情况持续更新
    JVM 性能监控 工具
    redis ---RDB 和 AOF 持久策略对比
    数组、链表等常用数据结构和集合浅解(java)
    关于界面刷新嵌套展示(form标签 target 属性)问题
    对象是否存在的判定方法
    数据库大量插入数据的sql 优化
    Java集合之LinkedList
    Java集合类之ArrayList
    Java并发程序基础
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9201587.html
Copyright © 2020-2023  润新知