• python爬取今日头条关键字图集


    1.访问搜索图集结果,获得json如下(右图为data的一条的详细内容).页面以Ajax呈现,每次请求20个图集,其中

    title     --- 图集名字

    article_url  --- 图集的地址

    gallery_pic_count    --- 图集图片数量

      

    2. 访问其中的图集 

       访问article_url,获得图集图片详细信息,其中图片url为下载地址

    展现出爬虫关键部分,整体项目地址在https://github.com/GeoffreyHub/toutiao_spider

      1 #!/usr/bin/env python
      2 # encoding: utf-8
      3 
      4 """
      5 @version: python37
      6 @author: Geoffrey
      7 @file: spider.py
      8 @time: 18-10-24 上午11:15
      9 """
     10 import json
     11 import re
     12 from multiprocessing import Pool
     13 import urllib3
     14 urllib3.disable_warnings()
     15 from requests import RequestException
     16 
     17 from common.request_help import make_session
     18 from db.mysql_handle import MysqlHandler
     19 from img_spider.settings import *
     20 
     21 
     22 
     class SpiderTouTiao:
         """Crawl Toutiao gallery search results for one keyword.

         Workflow: ``search_index`` fetches one page of search results,
         ``gallery_list`` opens the result pages and extracts the image URLs,
         and ``get_imgs`` downloads every image, writing it to disk and
         inserting it into MySQL through ``self.mysql_handler``.
         """

         # Matches the gallery payload embedded in an article page as
         # ``gallery: JSON.parse("<escaped JSON>"),``.  The dot and the
         # parentheses of the JavaScript call are escaped so that they match
         # literally; group 1 is the escaped JSON text between the quotes.
         _GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)

         def __init__(self, keyword):
             """Create the HTTP session and the MySQL handler used by the crawl.

             Args:
                 keyword: Search term sent as the ``keyword`` query parameter.
             """
             self.session = make_session(debug=True)
             self.url_index = 'https://www.toutiao.com/search_content/'
             self.keyword = keyword
             self.mysql_handler = MysqlHandler(MYSQL_CONFIG)

         def search_index(self, offset):
             """Fetch one page of gallery search results.

             The decoded JSON response is also dumped to
             ``../json_data/搜索结果-{offset}.json``.

             Args:
                 offset: Value sent as the ``offset`` query parameter.

             Returns:
                 An iterator of ``(title, gallery_pic_count, article_url)``
                 tuples, or an empty list when the request fails or the
                 response status is not 200.
             """
             params = {
                 'offset': f'{offset}',
                 'format': 'json',
                 'keyword': self.keyword,
                 'autoload': 'true',
                 'count': '20',
                 'cur_tab': '3',
                 'from': 'gallery',
             }

             try:
                 response = self.session.get(self.url_index, params=params)
                 # Compare by value: ``is`` tests object identity and only
                 # works for small ints by accident of the interpreter.
                 if response.status_code != 200:
                     print('请求失败')
                     return []
                 json_data = response.json()
                 with open(f'../json_data/搜索结果-{offset}.json', 'w', encoding='utf-8') as f:
                     json.dump(json_data, f, indent=4, ensure_ascii=False)
             except (RequestException, ValueError, OSError):
                 # Best effort: a network error, an undecodable body or a failed
                 # dump skips this page instead of aborting the worker.
                 # NOTE(review): assumes make_session() returns a requests
                 # session, so network errors are RequestException -- confirm.
                 print('请求失败')
                 return []
             return self.get_gallery_url(json_data)

         @staticmethod
         def get_gallery_url(json_data):
             """Yield ``(title, gallery_pic_count, article_url)`` per search hit.

             Entries without a title or an article URL are skipped, and a
             missing or null ``data`` field yields nothing.

             Args:
                 json_data: Decoded search response; its ``data`` entry is
                     expected to be a list of dicts.
             """
             for info in json_data.get("data") or []:
                 title = info.get("title")
                 article_url = info.get("article_url")
                 if not title or not article_url:
                     continue
                 yield title, info.get("gallery_pic_count"), article_url

         @staticmethod
         def _parse_gallery(html):
             """Return the gallery dict embedded in an article page, or None.

             The page carries the gallery as a double-quoted JavaScript string
             passed to ``JSON.parse``.  That literal is decoded as a JSON string
             and the result is parsed as JSON.

             SECURITY: this replaces ``eval`` on the downloaded page text, which
             would execute whatever expression the page contained.
             """
             match = SpiderTouTiao._GALLERY_PATTERN.search(html)
             if match is None:
                 return None
             try:
                 gallery = json.loads(json.loads('"{}"'.format(match.group(1))))
             except ValueError:
                 return None
             return gallery if isinstance(gallery, dict) else None

         def gallery_list(self, search_data, limit=1):
             """Fetch gallery pages and collect their image URLs.

             Each successfully parsed gallery is also dumped to
             ``../json_data/{title}-搜索结果.json``; nothing is written for a
             page whose gallery cannot be parsed.

             Args:
                 search_data: Iterable of ``(title, gallery_pic_count,
                     article_url)`` tuples; ``None`` is treated as empty.
                 limit: Maximum number of search results to visit.  The default
                     of 1 keeps the earlier behaviour, where the loop stopped
                     after the first result; pass ``None`` to visit all.

             Returns:
                 Dict mapping gallery title to a list of
                 ``(abstract, image_url)`` pairs.
             """
             gallery_urls = {}
             visited = 0
             for title, gallery_pic_count, article_url in search_data or ():
                 print(title, gallery_pic_count, article_url)
                 response = self.session.get(article_url)
                 gallery = self._parse_gallery(response.text)

                 if gallery is None:
                     print('解析不到图片url')
                 else:
                     image_urls = [image["url"] for image in gallery.get("sub_images", [])]
                     # A list, not a bare zip iterator, so the pairs can be
                     # read more than once.
                     gallery_urls[title] = list(zip(gallery.get("sub_abstracts", []), image_urls))
                     with open(f'../json_data/{title}-搜索结果.json', 'w', encoding='utf-8') as f:
                         json.dump(gallery, f, indent=4, ensure_ascii=False)

                 # Checked at the end of the body so no extra item is pulled
                 # from a generator once the limit is reached.
                 visited += 1
                 if limit is not None and visited >= limit:
                     break

             return gallery_urls

         def get_imgs(self, gallery_urls, save_dir='/home/geoffrey/图片/今日头条'):
             """Download every image, saving it to disk and to MySQL.

             Args:
                 gallery_urls: Dict mapping gallery title to an iterable of
                     ``(abstract, image_url)`` pairs.
                 save_dir: Directory that receives ``{title}-{index}.jpg``
                     files; the default is the path that used to be hard-coded.
             """
             sql = 'insert into img_gallery(title, abstract, imgs) values(%s, %s, %s)'
             for title, infos in gallery_urls.items():
                 for index, (abstract, img_url) in enumerate(infos):
                     print(index, abstract)
                     img_content = self.session.get(img_url).content

                     with open(f'{save_dir}/{title}-{index}.jpg', 'wb') as f:
                         f.write(img_content)

                     self.mysql_handler.insertOne(sql, [title, abstract, img_content])
                     # end() runs after every insert, as before.
                     # NOTE(review): presumably commits the row -- confirm
                     # against MysqlHandler.
                     self.mysql_handler.end()

             print('保存图集完成' + '-' * 50)
    114 
    115 
    def main(offset):
        """Crawl one page of search results and store its gallery images.

        Args:
            offset: Search-result offset handed to ``SpiderTouTiao.search_index``.
        """
        spider = SpiderTouTiao(KEY_WORD)
        try:
            search_data = spider.search_index(offset)
            if not search_data:
                # A failed search produces no result set; stop here instead of
                # handing it to gallery_list to iterate over.
                return
            gallery_urls = spider.gallery_list(search_data)
            spider.get_imgs(gallery_urls)
        finally:
            # Runs even when a step above raises, so the handler is always
            # disposed.  NOTE(review): presumably this releases the MySQL
            # connection -- confirm against MysqlHandler.
            spider.mysql_handler.dispose()
    122 
    123 
    if __name__ == '__main__':
        # One offset per search page; search_index requests 20 results per page.
        # GROUP_START and GROPE_END (sic) are not defined in this file.
        # NOTE(review): presumably they come from the star import of
        # img_spider.settings -- confirm.
        groups = [x * 20 for x in range(GROUP_START, GROPE_END)]

        # The with-block shuts the worker processes down once map() has
        # returned, and also when map() raises.
        with Pool(10) as pool:
            pool.map(main, groups)

        # To debug without multiprocessing, call main(offset) for each offset
        # in groups serially instead.

    项目结构如下:

    .
    ├── common
    │ ├── __init__.py
    │ ├── __pycache__
    │ │ ├── __init__.cpython-37.pyc
    │ │ └── request_help.cpython-37.pyc
    │ ├── request_help.py
    ├── db
    │ ├── __init__.py
    │ ├── mysql_handle.py
    │ └── __pycache__
    │ ├── __init__.cpython-37.pyc
    │ └── mysql_handle.cpython-37.pyc
    ├── img_spider
    │ ├── __init__.py
    │ ├── __pycache__
    │ │ ├── __init__.cpython-37.pyc
    │ │ └── settings.cpython-37.pyc
    │ ├── settings.py
    │ └── spider.py
    └── json_data
    ├── 沐浴三里屯的秋-搜索结果.json
    ├── 盘点三里屯那些高逼格的苍蝇馆子-搜索结果.json
    ├── 搜索结果-0.json
    ├── 搜索结果-20.json
    ├── 搜索结果-40.json

  • 相关阅读:
    快速排序
    Java LinkedList
    Java ArrayList
    Java ReentrantLock
    Java Timer
    Java 管道PipedInputStream PipedOutStream PipedReader PipedWriter
    6. Samba服务和防火墙配置笔记
    5. 网络配置与FTP服务笔记
    3.vi 和软件安装
    2 . Linux常见命令
  • 原文地址:https://www.cnblogs.com/geoffreyone/p/9855602.html
Copyright © 2020-2023  润新知