• Multi-threaded Crawler


    Use cases

      1. Multi-processing: CPU-bound programs
      2. Multi-threading: crawlers (network I/O), local disk I/O (a minimal sketch contrasting the two follows this list)
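
    A minimal sketch contrasting the two, with placeholder workers cpu_task and io_task (both hypothetical names):

    from multiprocessing import Process
    from threading import Thread
    import time

    def cpu_task():
        # CPU-bound work: heavy computation keeps the GIL busy
        sum(i * i for i in range(10_000_000))

    def io_task():
        # I/O-bound work: the GIL is released while waiting (simulated with sleep)
        time.sleep(1)

    if __name__ == '__main__':
        p = Process(target=cpu_task)  # a process sidesteps the GIL for CPU work
        p.start()
        p.join()
        t = Thread(target=io_task)    # a thread is cheap and fine for I/O waits
        t.start()
        t.join()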

    Queue

    # import the module
    from queue import Queue
    # usage
    q = Queue()
    q.put(url)
    q.get() # blocks when the queue is empty
    q.empty() # check whether the queue is empty, True/False
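
    Note that a bare q.get() blocks forever once the queue is empty. A small sketch of the timeout variant from the standard library, which multi-threaded workers later in this section can use:

    from queue import Queue, Empty

    q = Queue()
    try:
        url = q.get(block=True, timeout=3)  # wait at most 3 seconds
    except Empty:
        print('queue stayed empty')         # raised when nothing arrived in time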

    Threading module

    # import the module
    from threading import Thread
    
    # usage flow
    t = Thread(target=func) # create a thread object (func is the function the thread will run)
    t.start() # start the thread
    t.join()  # block until the thread finishes, then reclaim it
    
    # how to create multiple threads
    t_list = []
    for i in range(5):
        t = Thread(target=func)
        t_list.append(t)
        t.start()
    for t in t_list:
        t.join()
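
    Putting the queue and the threads together, a minimal runnable sketch of the worker pattern used throughout this section (the URLs are placeholders):

    from queue import Queue
    from threading import Thread

    q = Queue()
    for n in range(10):
        q.put('http://example.com/page/%d' % n)  # hypothetical URLs

    def worker():
        # drain the queue (see the race-condition caveat later in this section)
        while not q.empty():
            url = q.get()
            print('handling', url)

    t_list = []
    for i in range(3):
        t = Thread(target=worker)
        t_list.append(t)
        t.start()
    for t in t_list:
        t.join()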

    Xiaomi App Store scraping (multi-threaded)

    Goal

    1. URL: search Baidu for "小米应用商店" (Xiaomi App Store) and open the official site
    2. Target: app category "Chat & Social" (聊天社交)
       App name
       App link

    Implementation steps

    1. Confirm whether the content is dynamically loaded

    1. The page refreshes only partially
    2. Right-click to view the page source; searching for a visible keyword finds nothing
    # The site loads its data dynamically, so we need to capture and analyze network packets (a quick programmatic check follows)
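
    A quick way to run that check in code, as a sketch; the category URL and the keyword are assumptions based on what the rendered page shows in a browser:

    import requests

    # keyword: any app name you can see on the rendered category page (hypothetical)
    keyword = '微信'
    html = requests.get('http://app.mi.com/category/2',
                        headers={'User-Agent': 'Mozilla/5.0'}).text
    # False means the app list is injected later by Ajax rather than present in the source
    print(keyword in html)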

    2. Capture network packets with F12 (DevTools)

    1. Capture the URL that returns the JSON data (Request URL in the Headers tab)
       http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30
            
    2. Inspect and analyze the query parameters (Query String Parameters in the Headers tab)
       page: 1
       categoryId: 2
       pageSize: 30
       # Only page changes (0 1 2 3 ...), so we can build every JSON URL just by varying the page value (see the sketch below)
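
    A small sketch of generating those paginated URLs from the template above:

    url_t = 'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'
    urls = [url_t.format(page) for page in range(5)]  # first five pages
    for u in urls:
        print(u)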

    3. Code implementation

    import requests
    from threading import Thread
    from queue import Queue
    import time
    from fake_useragent import UserAgent
    from lxml import etree
    
    class XiaomiSpider(object):
      def __init__(self):
        self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30'
        # queue holding every URL to crawl
        self.q = Queue()
        self.ua = UserAgent()
        # counter for the number of apps scraped
        self.i = 0
        # list holding every (category id, pages) tuple
        self.id_list = []
    
      def get_cateid(self):
        # request the homepage
        url = 'http://app.mi.com/'
        headers = { 'User-Agent':self.ua.random }
        html = requests.get(url=url,headers=headers).text
        # parse the category list
        parse_html = etree.HTML(html)
        xpath_bds = '//ul[@class="category-list"]/li'
        li_list = parse_html.xpath(xpath_bds)
        for li in li_list:
          typ_name = li.xpath('./a/text()')[0]              # category name (not used further)
          typ_id = li.xpath('./a/@href')[0].split('/')[-1]  # e.g. '/category/2' -> '2'
          # compute the number of pages for this category
          pages = self.get_pages(typ_id)
          self.id_list.append( (typ_id,pages) )
    
        # enqueue all the URLs
        self.url_in()
    
      # fetch the value of 'count' and compute the number of pages
      def get_pages(self,typ_id):
        # every page of JSON carries a 'count' key
        url = self.url.format(0,typ_id)
        html = requests.get(
          url=url,
          headers={'User-Agent':self.ua.random}
        ).json()
        count = html['count']
        pages = int(count) // 30 + 1
    
        return pages
    
      # put the URLs into the queue
      def url_in(self):
        for id in self.id_list:
          # id is a tuple, e.g. ('2', pages)
          for page in range(id[1]):
            url = self.url.format(page,id[0])
            print(url)
            # enqueue the URL
            self.q.put(url)
    
      # thread worker: get() - request - parse - process the data
      def get_data(self):
        while True:
          if not self.q.empty():
            url = self.q.get()
            headers = {'User-Agent':self.ua.random}
            html = requests.get(url=url,headers=headers).json()
            self.parse_html(html)
          else:
            break
    
      # parse the JSON and extract the fields
      def parse_html(self,html):
        for app in html['data']:
          # app name
          name = app['displayName']
          link = 'http://app.mi.com/details?id=' + app['packageName']
          print(name,link)
          self.i += 1
    
      # main entry
      def main(self):
        # enqueue all the URLs
        self.get_cateid()
        t_list = []
        # create worker threads (range(1) starts a single worker; raise it, e.g. range(5), for real multi-threading)
        for i in range(1):
          t = Thread(target=self.get_data)
          t_list.append(t)
          t.start()
    
        # wait for all threads to finish
        for t in t_list:
          t.join()
    
        print('Total:', self.i)
    
    if __name__ == '__main__':
      start = time.time()
      spider = XiaomiSpider()
      spider.main()
      end = time.time()
      print('Elapsed: %.2f' % (end-start))
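
    One caveat about get_data(): checking q.empty() and then calling q.get() is not atomic, so with several workers two threads can pass the empty() check while only one URL remains. A hedged sketch of a safer drop-in for the method, using a blocking get() with a timeout:

    from queue import Empty

    # inside XiaomiSpider
    def get_data(self):
      while True:
        try:
          # block up to 3 seconds; Empty means the queue has drained
          url = self.q.get(block=True, timeout=3)
        except Empty:
          break
        headers = {'User-Agent': self.ua.random}
        html = requests.get(url=url, headers=headers).json()
        self.parse_html(html)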

    Tencent Recruitment data scraping

    1. Determine the URL and the goal

    1. URL: search Baidu for "腾讯招聘" (Tencent Recruitment) and open the job listings
    2. Goal: job title, job responsibilities, job requirements

    2. Requirements and analysis

    1. Viewing the page source shows that all required data is loaded dynamically via Ajax
    2. Capture the network packets with F12 and analyze them
    3. Level-1 page data: job title
    4. Level-2 page data: job responsibilities and requirements

    3. Level-1 JSON URL (pageIndex changes; the timestamp parameter was not examined)

    https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn

    4. Level-2 URL (postId changes; it is obtained from the level-1 page; a sketch chaining the two levels follows)

    https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn
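
    A small sketch of how the two levels chain together, using the URL templates and JSON keys captured above (the timestamp values are left exactly as captured):

    import requests

    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'

    html = requests.get(one_url.format(1),
                        headers={'User-Agent': 'Mozilla/5.0'}).json()
    for job in html['Data']['Posts']:
        # each level-1 post carries the PostId that keys its level-2 detail page
        print(job['RecruitPostName'], two_url.format(job['PostId']))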

    5. Code implementation

    import requests
    import json
    import time
    import random
    from fake_useragent import UserAgent
    
    class TencentSpider(object):
      def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
    
      # build a random User-Agent header
      def get_headers(self):
        ua = UserAgent()
        headers = { 'User-Agent': ua.random }
        return headers
    
      # fetch and decode the response content
      def get_page(self,url):
        html = requests.get(url=url,headers=self.get_headers()).content.decode('utf-8','ignore')
        # json.loads() converts a JSON string into Python data types
        html = json.loads(html)
        return html
    
      # main worker: fetch all the data
      def parse_page(self,one_url):
        html = self.get_page(one_url)
        item = {}
        for job in html['Data']['Posts']:
          item['name'] = job['RecruitPostName']
          item['address'] = job['LocationName']
          # take the PostId to build the level-2 URL
          post_id = job['PostId']
          # responsibilities and requirements (level-2 page)
          two_url = self.two_url.format(post_id)
          item['duty'],item['requirement'] = self.parse_two_page(two_url)
          print(item)
    
      def parse_two_page(self,two_url):
        html = self.get_page(two_url)
        # responsibilities + requirements
        duty = html['Data']['Responsibility']
        requirement = html['Data']['Requirement']
    
        return duty,requirement
    
      # get the total number of pages
      def get_pages(self):
          url = self.one_url.format(1)
          html = self.get_page(url)
          pages = int(html['Data']['Count']) // 10 + 1
    
          return pages
    
      def main(self):
        # total number of pages; pageIndex runs from 1 to pages inclusive,
        # so use range(1,pages+1) (range(1,pages) would drop the last page)
        pages = self.get_pages()
        for index in range(1,pages+1):
          one_url = self.one_url.format(index)
          self.parse_page(one_url)
          time.sleep(random.uniform(0.5,1.5))
    
    if __name__ == '__main__':
      start = time.time()
      spider = TencentSpider()
      spider.main()
      end = time.time()
      print('Elapsed: %.2f' % (end-start))

    6. Multi-threading approach:

      Put every level-1 page URL into a queue, then scrape with multiple threads

    7. Multi-threaded code implementation

    import requests
    import json
    import time
    import random
    from fake_useragent import UserAgent
    from threading import Thread
    from queue import Queue
    
    class TencentSpider(object):
      def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
        self.q = Queue()
        # counter
        self.i = 0
    
    
      # build a random User-Agent header
      def get_headers(self):
        ua = UserAgent()
        headers = { 'User-Agent': ua.random }
        return headers
    
      # fetch and decode the response content
      def get_page(self,url):
        html = requests.get(url=url,headers=self.get_headers()).content.decode('utf-8','ignore')
        # json.loads() converts a JSON string into Python data types
        html = json.loads(html)
        return html
    
      # thread worker: fetch all the data
      def parse_page(self):
        while True:
            if not self.q.empty():
                one_url = self.q.get()
                html = self.get_page(one_url)
                item = {}
                for job in html['Data']['Posts']:
                  item['name'] = job['RecruitPostName']
                  item['address'] = job['LocationName']
                  # take the PostId to build the level-2 URL
                  post_id = job['PostId']
                  # responsibilities and requirements (level-2 page)
                  two_url = self.two_url.format(post_id)
                  item['duty'],item['requirement'] = self.parse_two_page(two_url)
                  print(item)
                  self.i += 1
                # sleep a random interval after finishing each page
                time.sleep(random.uniform(0,1))
            else:
                break
    
      def parse_two_page(self,two_url):
        html = self.get_page(two_url)
        # responsibilities + requirements
        duty = html['Data']['Responsibility']
        requirement = html['Data']['Requirement']
    
        return duty,requirement
    
      # get the total number of pages
      def get_pages(self):
          url = self.one_url.format(1)
          html = self.get_page(url)
          pages = int(html['Data']['Count']) // 10 + 1
    
          return pages
    
      def main(self):
        # enqueue the level-1 URLs; pageIndex runs from 1 to pages inclusive
        pages = self.get_pages()
        for index in range(1,pages+1):
          one_url = self.one_url.format(index)
          self.q.put(one_url)
    
        t_list = []
        for i in range(5):
          t = Thread(target=self.parse_page)
          t_list.append(t)
          t.start()
    
        for t in t_list:
            t.join()
    
        print('Total:', self.i)
    
    if __name__ == '__main__':
      start = time.time()
      spider = TencentSpider()
      spider.main()
      end = time.time()
      print('Elapsed: %.2f' % (end-start))
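
    One refinement worth noting: self.i += 1 from several threads is a read-modify-write that Python does not guarantee to be atomic, so the final count can come up short. A minimal standalone sketch guarding a shared counter with threading.Lock:

    from threading import Thread, Lock

    lock = Lock()
    count = 0

    def add():
        global count
        for _ in range(100000):
            with lock:  # serialize the read-modify-write
                count += 1

    threads = [Thread(target=add) for _ in range(5)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(count)  # reliably 500000 with the lock held
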
  • Original article: https://www.cnblogs.com/maplethefox/p/11352817.html