• Scraping Maoyan movies with the conventional approach


    1: First determine the target site: work out the pattern of the URLs to crawl and whether the request method is POST or GET, as sketched below.
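
    For the Maoyan top-100 board, each page is fetched with a plain GET request, and the pages differ only in an offset query parameter, as the code in the later steps assumes. A minimal sketch of that pagination pattern:

    # The ten board pages differ only in offset (0, 10, ..., 90),
    # so the whole crawl reduces to a loop over these GET URLs.
    for offset in range(0, 100, 10):
        print('http://maoyan.com/board/4?offset=' + str(offset))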

    2: Then write a simple crawler to test fetching the page:

    import requests
    from requests.exceptions import RequestException

    def get_one_page(url):
        # Send the request with a browser-like User-Agent so the site is
        # less likely to reject the crawler.
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
        }
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def main():
        url = 'http://maoyan.com/board/4?'
        html = get_one_page(url)
        print(html)

    if __name__ == "__main__":
        main()
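
    If this prints the page's HTML, the request path works; a None result means the site returned a non-200 status (for example, because it blocked the crawler) or a network error was caught by the RequestException handler.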

    3: Once the test passes, add a loop over the pages and process the scraped content, saving it either as txt (method one) or as csv (method two); see the sketch after this block for writing genuine CSV rows:

    # !/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    import re
    import time
    import json
    from requests.exceptions import RequestException

    def get_one_page(url):
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
        }
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    # parse_one_page parses the HTML; re.S lets '.' match any character,
    # including newlines, and each (.*?) group captures one field lazily.
    def parse_one_page(html):
        pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name"><a'
                             + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                             + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
        # html is the target string being matched
        items = re.findall(pattern, html)
        for item in items:
            yield {
                "index": item[0],
                "image": item[1],
                "name": item[2],
                "actor": item[3].strip(),
                "time": item[4].strip(),
                "star": item[5] + item[6],
            }

    def main(offset):
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        for item in parse_one_page(html):
            print(item)
            # write_to_file(item)
            write_to_csv(item)

    def write_to_csv(content):
        # Appends one JSON object per line; despite the .csv extension,
        # this is really the JSON Lines format.
        with open("猫眼result.csv", 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    # def write_to_file(content):
    #     # 'a' appends to the file instead of overwriting it
    #     with open('猫眼result.txt', 'a', encoding='utf-8') as f:
    #         f.write(json.dumps(content, ensure_ascii=False) + '\n')

    if __name__ == "__main__":
        # Crawl all ten pages, pausing between requests to be polite.
        for i in range(10):
            main(offset=i * 10)
            time.sleep(1)
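
    Note that write_to_csv above actually writes one JSON object per line (JSON Lines) despite the .csv extension. Below is a minimal sketch of writing genuine CSV rows with the standard csv module instead; write_to_real_csv and its output filename are hypothetical names, and the field names follow the dict keys yielded by parse_one_page:

    import csv
    import os

    FIELDS = ["index", "image", "name", "actor", "time", "star"]

    def write_to_real_csv(content, path="猫眼result_rows.csv"):
        # Write the header row only once, when the file does not exist yet.
        new_file = not os.path.exists(path)
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDS)
            if new_file:
                writer.writeheader()
            writer.writerow(content)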

    Scraping with a process pool:

    # !/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    import re
    import json
    from multiprocessing import Pool
    from requests.exceptions import RequestException

    def get_one_page(url):
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
        }
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    # parse_one_page parses the HTML; re.S lets '.' match any character,
    # including newlines, and each (.*?) group captures one field lazily.
    def parse_one_page(html):
        pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name"><a'
                             + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                             + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
        items = re.findall(pattern, html)
        for item in items:
            yield {
                "index": item[0],
                "image": item[1],
                "name": item[2],
                "actor": item[3].strip(),
                "time": item[4].strip(),
                "star": item[5] + item[6],
            }

    def main(offset):
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        for item in parse_one_page(html):
            print(item)
            # write_to_file(item)
            write_to_csv(item)

    def write_to_csv(content):
        # Appends one JSON object per line, as in the sequential version.
        with open("猫眼进程result.csv", 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    # def write_to_file(content):
    #     # 'a' appends to the file instead of overwriting it
    #     with open('猫眼result.txt', 'a', encoding='utf-8') as f:
    #         f.write(json.dumps(content, ensure_ascii=False) + '\n')

    if __name__ == "__main__":
        # Hand one offset to each worker process; map blocks until all
        # pages are done, then the pool is shut down cleanly.
        pool = Pool()
        pool.map(main, [i * 10 for i in range(10)])
        pool.close()
        pool.join()
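
    One caveat with the pooled version: several worker processes append to the same file concurrently, so output lines can interleave. A sketch of a safer arrangement, assuming the get_one_page/parse_one_page/write_to_csv functions above (scrape_one is a hypothetical helper): workers only fetch and parse, and the parent process does all the writing:

    from multiprocessing import Pool

    def scrape_one(offset):
        # Worker: fetch and parse only; return the items instead of writing.
        html = get_one_page('http://maoyan.com/board/4?offset=' + str(offset))
        return list(parse_one_page(html)) if html else []

    if __name__ == "__main__":
        with Pool() as pool:
            pages = pool.map(scrape_one, [i * 10 for i in range(10)])
        # Only the parent writes, so output lines cannot interleave.
        for page in pages:
            for item in page:
                write_to_csv(item)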