• 【Python爬虫案例学习】分析Ajax请求并抓取今日头条街拍图片


    1.抓取索引页内容
    利用requests请求目标站点,得到索引网页HTML代码,返回结果。

    from urllib.parse import urlencode
    from requests.exceptions import RequestException
    import requests
    '''
    遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
    '''
    def get_page_index(offset, keyword):
        """Fetch one page of Toutiao search results (JSON) for *keyword*.

        offset is the paging cursor (0, 20, 40, ...). Returns the response
        body on HTTP 200, otherwise None.
        """
        headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
        data = {
            'format': 'json',
            'offset': offset,
            'keyword': keyword,
            'autoload': 'true',
            'count': 20,
            'cur_tab': 1,
            'from': 'search_tab',
            'pd': 'synthesis',
        }
        url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
        try:
            # BUG FIX: requests.get() must be inside the try block; the
            # original called it before `try`, so a RequestException
            # (timeout, DNS failure, ...) escaped uncaught.
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('请求索引页失败')
            return None
    
    def main():
        # Fetch the first index page for the keyword '街拍' and dump the raw body.
        print(get_page_index(0, '街拍'))

    if __name__ == '__main__':
        main()
    

    2.抓取详情页内容
    解析返回结果,得到详情页的链接,并进一步抓取详情页的信息。

    • 获取页面网址:
    def parse_page_index(html):
      """Yield article URLs from an index-page JSON payload.

      FIX: skip entries without an 'article_url' (e.g. ad or image-only
      items); the original yielded None for them, which crashed the
      downstream requests.get(None) call.
      """
      data = json.loads(html)
      if data and 'data' in data:
        for item in data.get('data'):
          url = item.get('article_url')
          if url:
            yield url
    
    • 单个页面代码:
    def get_page_detail(url):
      """Fetch a detail page; return its HTML text, or None on any failure."""
      headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
      try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
          return response.text
        return None
      except RequestException:
        # FIX: the message had a duplicated character ('请求详情页页失败')
        print('请求详情页失败')
        return None
    
    • 图片地址
    def parse_page_detail(html,url):
      """Extract the title and gallery image URLs from a detail page.

      Returns {'title', 'images', 'url'} when a gallery is found, else None.
      Side effect: calls download_image() for every image URL found.
      """
      soup = BeautifulSoup(html,'lxml')
      # Guard against pages without a <title> tag (original indexed [0] blindly).
      title_tags = soup.select('title')
      title = title_tags[0].get_text() if title_tags else ''
      # BUG FIX: the original pattern 'gallery: JSON.parse((.*?))' left the
      # parentheses unescaped, making them regex groups — group(1) then lazily
      # matched the empty string and json.loads('') raised. Escape the literal
      # parens and the dot, and use a raw string.
      # NOTE(review): assumes the serialized gallery JSON contains no ')' —
      # confirm against a live page.
      images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
      result = re.search(images_pattern, html)
      if result:
        # group(1) is a JSON-encoded *string* that itself holds JSON,
        # hence the two json.loads calls.
        data = json.loads(result.group(1))
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
          sub_images = data.get('sub_images')
          images = [item.get('url') for item in sub_images]
          for image in images: download_image(image)
          return {
            'title': title,
            'images':images,
            'url':url
          }
    

    3.下载图片与保存数据库
    将图片下载到本地,并把页面信息及图片URL保存到MongoDB。

    # Persist one crawled result into MongoDB
    def save_to_mongo(result):
      """Insert one result document into the configured collection; True on success."""
      # FIX: Collection.insert() is deprecated since pymongo 3 and removed in
      # pymongo 4; insert_one() is the supported API and its result is truthy.
      if db[MONGO_TABLE].insert_one(result):
        print('存储到MongoDb成功', result)
        return True
      return False
    
    # Download one image and hand its bytes to save_image()
    def download_image(url):
      print('正在下载',url)
      # FIX: the User-Agent string was corrupted ('537.    36' contained stray
      # spaces), so a malformed UA header was sent with every image request.
      headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
      try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
          save_image(response.content)
        return None
      except RequestException:
        print('请求图片失败', url)
        return None
    
    def save_image(content):
      """Write raw image bytes to <cwd>/<md5-of-content>.jpg, skipping duplicates."""
      file_path = f'{os.getcwd()}/{md5(content).hexdigest()}.jpg'
      if os.path.exists(file_path):
        return
      with open(file_path, 'wb') as out:
        out.write(content)
    

    4.开启循环及多线程
    对多页内容遍历,开启多线程提高抓取速度。

    # FIX: the snippet's lines were inconsistently indented (invalid as pasted).
    groups = [x*20 for x in range(GROUP_START, GROUP_END+1)]  # offsets 20, 40, ...
    pool = Pool()  # one worker process per CPU core by default
    pool.map(main,groups)
    

    完整代码:

    from urllib.parse import urlencode
    from requests.exceptions import RequestException
    from bs4 import BeautifulSoup
    from hashlib import md5
    from multiprocessing import Pool
    from config import *
    import pymongo
    import requests
    import json
    import re
    import os
    '''
    遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
    '''
    # Module-level MongoDB connection shared by save_to_mongo().
    # MONGO_URL / MONGO_DB are constants from config.py (imported via `from config import *`).
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    
    def get_page_index(offset, keyword):
      """Request one page of Toutiao search results; return the body text or None."""
      ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
      query = {
        'format': 'json',
        'offset': offset,
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
      }
      url = 'https://www.toutiao.com/search_content/?' + urlencode(query)
      try:
        resp = requests.get(url, headers={'User-Agent': ua})
      except RequestException:
        print('请求索引页失败')
        return None
      return resp.text if resp.status_code == 200 else None
    
    def parse_page_index(html):
      """Yield article URLs from an index-page JSON payload.

      FIX: skip entries without an 'article_url' (e.g. ad or image-only
      items); the original yielded None for them, which crashed the
      downstream requests.get(None) call.
      """
      data = json.loads(html)
      if data and 'data' in data:
        for item in data.get('data'):
          url = item.get('article_url')
          if url:
            yield url
    
    def get_page_detail(url):
      """Fetch a detail page; return its HTML text, or None on any failure."""
      headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
      try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
          return response.text
        return None
      except RequestException:
        # FIX: the message had a duplicated character ('请求详情页页失败')
        print('请求详情页失败')
        return None
    
    def parse_page_detail(html,url):
      """Extract the title and gallery image URLs from a detail page.

      Returns {'title', 'images', 'url'} when a gallery is found, else None.
      Side effect: calls download_image() for every image URL found.
      """
      soup = BeautifulSoup(html,'lxml')
      # Guard against pages without a <title> tag (original indexed [0] blindly).
      title_tags = soup.select('title')
      title = title_tags[0].get_text() if title_tags else ''
      # BUG FIX: the original pattern 'gallery: JSON.parse((.*?))' left the
      # parentheses unescaped, making them regex groups — group(1) then lazily
      # matched the empty string and json.loads('') raised. Escape the literal
      # parens and the dot, and use a raw string.
      # NOTE(review): assumes the serialized gallery JSON contains no ')' —
      # confirm against a live page.
      images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
      result = re.search(images_pattern, html)
      if result:
        # group(1) is a JSON-encoded *string* that itself holds JSON,
        # hence the two json.loads calls.
        data = json.loads(result.group(1))
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
          sub_images = data.get('sub_images')
          images = [item.get('url') for item in sub_images]
          for image in images: download_image(image)
          return {
            'title': title,
            'images':images,
            'url':url
          }
    
    def save_to_mongo(result):
      """Insert one result document into the configured collection; True on success."""
      # FIX: Collection.insert() is deprecated since pymongo 3 and removed in
      # pymongo 4; insert_one() is the supported API and its result is truthy.
      if db[MONGO_TABLE].insert_one(result):
        print('存储到MongoDb成功', result)
        return True
      return False
    
    def download_image(url):
      """Download one image and hand its bytes to save_image()."""
      print('正在下载',url)
      # FIX: the User-Agent string was corrupted ('537.    36' contained stray
      # spaces), so a malformed UA header was sent with every image request.
      headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
      try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
          save_image(response.content)
        return None
      except RequestException:
        print('请求图片失败', url)
        return None
    
    def save_image(content):
      """Write raw image bytes to <cwd>/<md5-of-content>.jpg, skipping duplicates."""
      file_path = f'{os.getcwd()}/{md5(content).hexdigest()}.jpg'
      if os.path.exists(file_path):
        return
      with open(file_path, 'wb') as out:
        out.write(content)
    
    def main(offset):
      """Crawl one index page: fetch the listing, then fetch/parse/store each article."""
      html = get_page_index(offset, KEYWORD)
      if html is None:
        # FIX: bail out on a failed index request; the original passed None
        # into parse_page_index, where json.loads(None) raises TypeError.
        return
      for url in parse_page_index(html):
        detail = get_page_detail(url)
        if detail:
          result = parse_page_detail(detail, url)
          if isinstance(result, dict):
            save_to_mongo(result)
      
    if __name__=='__main__':
        # NOTE(review): GROUP_START = 1 means offsets run 20..400 and offset 0
        # is never crawled — confirm that skipping the first page is intended.
        groups = [x*20 for x in range(GROUP_START, GROUP_END+1)]
        pool = Pool()
        pool.map(main,groups)
        # FIX: release the worker processes; the original never closed the pool.
        pool.close()
        pool.join()
    

    config.py

    MONGO_URL = 'localhost'  # MongoDB host, passed to pymongo.MongoClient
    MONGO_DB = 'toutiao'  # database name
    MONGO_TABLE = 'jiepai'  # collection used by save_to_mongo()
    
    GROUP_START = 1  # first page group; request offset = group * 20
    GROUP_END = 20  # last page group (inclusive)
    
    KEYWORD = '街拍'  # search keyword ("street snap")
    ~                 
    
  • 相关阅读:
    《那些年啊,那些事——一个程序员的奋斗史》——117
    《那些年啊,那些事——一个程序员的奋斗史》——116
    《那些年啊,那些事——一个程序员的奋斗史》——116
    《那些年啊,那些事——一个程序员的奋斗史》——118
    《那些年啊,那些事——一个程序员的奋斗史》——119
    《那些年啊,那些事——一个程序员的奋斗史》——117
    《那些年啊,那些事——一个程序员的奋斗史》——119
    283. Move Zeroes
    26. Remove Duplicates from Sorted Array
    268. Missing Number
  • 原文地址:https://www.cnblogs.com/Pythonmiss/p/11308800.html
Copyright © 2020-2023  润新知