• 崔庆才——Ajax今日头条多进程爬虫


    # 从AJAX入手解决主页面无数据问题
import json
import os
import re
import time
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from config import *
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    # Mongo服务要先启动,并创建你选定数据库,否则连接不上
    def save_to_mongo(ret_dict):
    if db[MONGO_TABLE].insert(ret_dict): # 知识点8:mongodb数据库的链接,配置文件方式传入
    print("插入数据到数据库成功", ret_dict["title"])
    return True
    return False
    def get_page_index(keyword,offset,headers,cookie):
    try:
    data = {
    'aid':"24",
    'app_name':"web_search",
    'offset':offset,
    'format':"json",
    'keyword':keyword,
    'autoload':"true",
    'count':"20",
    'en_qc':"1",
    'cur_tab':"1",
    'from':'search_tab',
    'pd':"synthesis",
    'timestamp':"1585525783382",
    '_signature':"MqqdBAAgEBC1BxnpKjcMhjKr3BAAGwyzftELDyc2Vi7Ug4gGwX7WlzBBtoBfhTP9rT-Eha5MhBFoxSsOVuYXGF4F1L2sGmX9A07QT2rsGhAXHp38jFF3LG2nRBQu9o52X09"

    }
    # urllib库的编码方式
    url = "https://www.toutiao.com/api/search/content/?"+urlencode(data)
    response = requests.get(url,headers=headers,cookies=cookie)
    if response.status_code == 200:
    return response.text
    return None
    except RequestException:
    print("Wrong!请求索引失败")
    return None
    # 拿url
    def parse_page_index(html):
    """构造生成器即可,或者这个函数的返回值是一个列表"""
    data = json.loads(html)
    if data and "data" in data.keys():
    for item in data.get("data"): # 知识点3:字典获取键的值的get方法
    if "article_url" in item.keys():
    url = item.get("article_url")
    yield url
    # 拿组图细节
    def get_page_detail(url,headers,cookie):
    try:
    response = requests.get(url,headers=headers,cookies=cookie)
    if response.status_code == 200:
    content = response.content.decode()
    return content
    return None
    except RequestException:
    print("get函数出错")
    return None

    # 下载图片
    def download(url,headers,cookie):
    print("正在下载图片",url)
    try:
    response = requests.get(url, headers=headers, cookies=cookie)
    if response.status_code == 200:
    content = response.content
    saveimg(content)
    return None
    except RequestException:
    print("请求出错")
    return None
    # 保存图片
    def saveimg(content):
    file_path = "{0}/{1}.{2}".format(os.getcwd(),md5(content).hexdigest(),"jpg") # 知识点9:运用md5进行去重,md5的简单回顾
    if not os.path.exists(file_path): # 知识点10:os方法的使用
    with open(file_path,"wb") as f:
    f.write(content)
    def parse_page_detail(html, url,headers,cookie):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')

    title = result[0].get_text() if result else ''
    # print(title)
    images_pattern = re.compile('gallery: JSON.parse("(.*)")', re.S)
    result = re.search(images_pattern, html)
    # print(result)
    if result:
    ret = result.group(1)
    ret = ret.replace("\", "")
    ret = ret.replace("u002F", "/")
    data = json.loads(ret)
    if data and 'sub_images' in data.keys():
    sub_images = data.get('sub_images')
    # print(sub_images)
    images = [item.get('url') for item in sub_images]
    for image in images: download(image,headers,cookie)
    return {
    'title': title,
    'url': url,
    'images': images
    }

    def main(offset):
    headers = {'user-agent':'xxx'}
    cookie = {'cookie':'xxx'}
    html = get_page_index("街拍",offset,headers,cookie)
    for url in parse_page_index(html):
    html = get_page_detail(url,headers,cookie)

    if html:
    result = parse_page_detail(html, url,headers,cookie)
    if result:
    print(result)
    save_to_mongo(result)
    if __name__ == "__main__":
    # main()
    groups = [x*20 for x in range(GROUP_START,GROUP_END+1)]
    pool = Pool()
    pool.map(main,groups)
  • 相关阅读:
    图解JAVA 垃圾回收机制(转)
    创建oracle数据库链路 dbLink
    在Windows 7使用Connectify来创建wifi热点的详细过程
    关于WinCe托盘做法
    运行项目提示oracle 需要安装 8.17 版本以上
    .js 兼容 FireFox 和 IE 键盘事件
    关于在右下脚弹出窗口javascript代码,并用ASP.NET在后台调用(原创)
    数组与指针
    fastjson漏洞汇总学习
    不要点,未写完!DNSlog资产外连
  • 原文地址:https://www.cnblogs.com/Knight66666/p/12597575.html
Copyright © 2020-2023  润新知