• Python 爬虫实例(2)—— 爬取今日头条


    # coding:utf-8
    import base64
    import random, re
    import sqlite3
    import redis, pickle
    import json, time
    import urllib3,urllib2,hashlib
    from datetime import datetime
    import threading
    import logging.handlers
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    import uuid
    
    import requests
    
    session = requests.session()
    #把连接加密成 MD5 生成唯一的主键
    # Hash a link with MD5 to produce a unique primary key.
    def md5(value):
        """Return the hex MD5 digest of *value*.

        Used as a unique primary key for crawled URLs. Accepts both byte
        strings and text (text is UTF-8 encoded first), so it behaves the
        same on Python 2 and Python 3. The original shadowed the builtin
        ``str`` as the parameter name and re-imported hashlib locally
        (it is already imported at module level).
        """
        m = hashlib.md5()
        if not isinstance(value, bytes):
            # hashlib requires bytes; encode unicode/text input.
            value = value.encode('utf-8')
        m.update(value)
        return m.hexdigest()
    
    
    
    
    def jinri():
        """Crawl the Toutiao PC "news_game" feed and archive articles in SQLite.

        For each feed page 1-19, fetch the JSON feed, then each article's
        detail page, extract the <article> HTML body, and insert one row per
        article into the local ``toutiao`` table (primary key = md5 of the
        article URL).

        Returns:
            list: one dict of parsed feed fields per successfully inserted
            article. (The original built ``list_data`` but never appended to
            it, so it always returned an empty list.)
        """
        list_data = []
        feed_url = "http://www.toutiao.com/api/pc/feed/"
        feed_headers = {
            "Host": "www.toutiao.com",
            "Connection": "keep-alive",
            "Accept": "text/javascript, text/html, application/xml, text/xml, */*",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://www.toutiao.com/ch/news_hot/",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        detail_headers = {
            "Host": "www.toutiao.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        # FIX: open one connection for the whole crawl (the original
        # reconnected per article and leaked the handle on exceptions).
        # Raw string keeps the same path value as the original Python 2
        # literal while being Python 3 safe (\U is an escape there).
        cx = sqlite3.connect(r"C:\Users\xuchunlin\PycharmProjects\study\db.sqlite3",
                             check_same_thread=False)
        cx.text_factory = str
        try:
            for page in range(1, 20):
                # Feed query parameters; "as"/"cp" are the signed tokens the
                # PC feed endpoint expects (hard-coded from a captured request).
                params = {
                    "category": "news_game",
                    "utm_source": "toutiao",
                    "widen": str(page),
                    "max_behot_time": "0",
                    "max_behot_time_tmp": "0",
                    "tadrequire": "true",
                    "as": "479BB4B7254C150",
                    "cp": "7E0AC8874BB0985",
                }
                feed = json.loads(session.get(url=feed_url, params=params,
                                              headers=feed_headers).text)
                # BUG FIX: the original tested feed["message1"], a key the feed
                # response does not carry (it returns "message"), so the check
                # raised KeyError instead of ever matching — confirm against a
                # live response if the API has changed since.
                if feed.get("message") != "success":
                    print("请求失败")
                    continue
                # BUG FIX: the inner loop reused the outer counter name `i`;
                # `item` is each article's JSON dict.
                for item in feed["data"]:
                    source_url = item["source_url"]
                    detail_url = "http://www.toutiao.com" + str(source_url)
                    # BUG FIX: `return_data` was undefined when the detail
                    # request raised (bare except: pass), turning the
                    # re.findall line into a NameError on the first failure.
                    contentData = ""
                    try:
                        detail_html = session.get(url=detail_url,
                                                  headers=detail_headers).content
                        found = re.findall(' <article>(.*?)</article>', detail_html)
                        if found:
                            contentData = found[0]
                    except Exception as e:
                        print(e)
                    try:
                        print("正在插入链接   %s   数据" % (feed_url))
                        record = {
                            "title": str(item["title"]),
                            "chinese_ta": str(item["chinese_tag"]),
                            "media_avatar_url": str(item["media_avatar_url"]),
                            "is_feed_ad": str(item["is_feed_ad"]),
                            "tag_url": str(item["tag_url"]),
                            "tag": str(item["tag"]),
                            "label": str(item["label"]),
                            "abstract": str(item["abstract"]),
                            "source_url": str(source_url),
                            # MD5 of the article URL is the unique key column.
                            "url": md5(str(detail_url)),
                            "contentData": str(contentData),
                        }
                        cx.execute(
                            "INSERT INTO toutiao (title,chinese_ta,media_avatar_url,is_feed_ad,tag_url,tag,label,abstract,source_url,url,contentData)VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                            (record["title"], record["chinese_ta"],
                             record["media_avatar_url"], record["is_feed_ad"],
                             record["tag_url"], record["tag"], record["label"],
                             record["abstract"], record["source_url"],
                             record["url"], record["contentData"]))
                        cx.commit()
                        # FIX: actually collect what was inserted.
                        list_data.append(record)
                    except Exception as e:
                        print(e)
                        print("cha ru shi bai ")
        finally:
            # Always release the connection, even if a page blows up mid-crawl.
            cx.close()
        return list_data
    
    
    print jinri()

    爬虫很简单,难的是自己去分析网页、解析网页,以及提升爬虫的效率。

  • 相关阅读:
    *** 疑问
    *** C++动态绑定(多态)example code 1
    *** C++纯虚函数使用example code 1
    *** C++实现string各类运算符重载成员函数
    *** C++虚函数表原理相关代码
    *** 自写MyString类:重载操作符 '+'
    *** [转]C++在类的成员函数中,允许直接访问该类的成员对象的私有成员变量
    *** [转] C++基础篇--overload重载&override覆盖&overwrite隐藏
    *** C++ 中的函数重载
    *** 关于虚函数的一些常见问题
  • 原文地址:https://www.cnblogs.com/xuchunlin/p/7097295.html
Copyright © 2020-2023  润新知