• Crawling WeChat Official Account Articles


    Sogou aggregates the official accounts and articles of the WeChat public platform, so we can crawl its search results; because Sogou blocks heavy crawling from a single IP, the script rotates through proxies.
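
    As a quick preview of what the spider requests, the snippet below builds the same index URL as get_index() in spider.py (type=2 selects article search; the keyword and page values here are just examples):

      from urllib.parse import urlencode

      params = {'query': 'python', 'type': 2, 'page': 1}
      print('http://weixin.sogou.com/weixin?' + urlencode(params))
      # -> http://weixin.sogou.com/weixin?query=python&type=2&page=1

    When Sogou decides an IP is crawling too aggressively, it answers such a request with a 302 redirect to an anti-spider page, which is what the proxy-switching logic below handles.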

    spider.py

      from urllib.parse import urlencode
      import pymongo
      import requests
      from lxml.etree import XMLSyntaxError
      from requests.exceptions import ConnectionError
      from pyquery import PyQuery as pq
      from config import *

      client = pymongo.MongoClient(MONGO_URI)
      db = client[MONGO_DB]

      base_url = 'http://weixin.sogou.com/weixin?'

      headers = {
          # Cookie copied from a logged-in browser session; replace it with your own.
          'Cookie': 'IPLOC=CN1100; SUID=194E796A2E08990A000000005B114E85; SUV=1527860869604056; ABTEST=1|1527860872|v1; SNUID=9FCBFCEF8680EB12510E6A9C86088B29; weixinIndexVisited=1; JSESSIONID=aaaqa95rD87Zu9-CJwlnw; sct=5; ppinf=5|1527862844|1529072444|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8Y3J0OjEwOjE1Mjc4NjI4NDR8cmVmbmljazoyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUh5bE5VSDJEVWNuSHBDWnVOVG9sN2tAd2VpeGluLnNvaHUuY29tfA; pprdig=EZE8CVVtoUTqmCoJj6bEWwKngY4di5UpGDFImTA9-1qrMK_tIJEtUyGR9_0Jcv5Xw1EuqLO9BNFvAKQv5DOQvmCWh-jxudk7SGv89NuhCLow7dxPysoOtLSI-keSaKVLKT82Vhg7rDBg0SlQ3y2uiG53lBUWL0wLVw4D_f_7MLg; sgid=17-35315605-AVsRVjwpV4ichpAzPibp6olGY; ppmdig=1527862844000000243bdb95cb03e086685bb1de06087c32',
          'Host': 'weixin.sogou.com',
          'Upgrade-Insecure-Requests': '1',
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36'
      }

      # Current proxy; None means requests go out directly from the local IP.
      proxy = None


      def get_proxy():
          """Fetch one proxy ('host:port') from the proxy pool."""
          try:
              response = requests.get(PROXY_POOL_URL)
              if response.status_code == 200:
                  return response.text
              return None
          except ConnectionError:
              return None


      def get_html(url, count=1):
          """Request a Sogou page, switching proxies on 302 anti-spider redirects."""
          print('Crawling', url)
          print('Trying Count', count)
          global proxy
          if count >= MAX_COUNT:
              print('Tried Too Many Counts')
              return None
          try:
              if proxy:
                  proxies = {
                      'http': 'http://' + proxy
                  }
                  response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
              else:
                  response = requests.get(url, allow_redirects=False, headers=headers)
              if response.status_code == 200:
                  return response.text
              if response.status_code == 302:
                  # Sogou has flagged this IP and is redirecting to its anti-spider
                  # page; grab a fresh proxy and retry, counting the attempt.
                  print('302')
                  proxy = get_proxy()
                  if proxy:
                      print('Using Proxy', proxy)
                      return get_html(url, count + 1)
                  print('Get Proxy Failed')
                  return None
              return None
          except ConnectionError as e:
              print('Error Occurred', e.args)
              proxy = get_proxy()
              return get_html(url, count + 1)


      def get_index(keyword, page):
          """Fetch one page of Sogou article-search results (type=2 means articles)."""
          data = {
              'query': keyword,
              'type': 2,
              'page': page
          }
          url = base_url + urlencode(data)
          return get_html(url)


      def parse_index(html):
          """Yield the article URLs from a search-results page."""
          doc = pq(html)
          items = doc('.news-box .news-list li .txt-box h3 a').items()
          for item in items:
              yield item.attr('href')


      def get_detail(url):
          """Fetch an article page itself; these are not behind the anti-spider check."""
          try:
              response = requests.get(url)
              if response.status_code == 200:
                  return response.text
              return None
          except ConnectionError:
              return None


      def parse_detail(html):
          """Extract title, body text, publish date, and account info from an article page."""
          try:
              doc = pq(html)
              title = doc('.rich_media_title').text()
              content = doc('.rich_media_content').text()
              # Note: some pages fill in #publish_time with JavaScript,
              # so this field may come back empty.
              date = doc('#publish_time').text()
              nickname = doc('#js_profile_qrcode > div > strong').text()
              wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
              return {
                  'title': title,
                  'content': content,
                  'date': date,
                  'nickname': nickname,
                  'wechat': wechat
              }
          except XMLSyntaxError:
              return None


      def save_to_mongo(data):
          """Upsert the article by title so re-crawls update instead of duplicating."""
          result = db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True)
          if result.acknowledged:
              print('Saved to Mongo', data['title'])
          else:
              print('Saved to Mongo Failed', data['title'])


      def main():
          for page in range(1, 101):
              html = get_index(KEYWORD, page)
              if html:
                  for article_url in parse_index(html):
                      article_html = get_detail(article_url)
                      if article_html:
                          article_data = parse_detail(article_html)
                          print(article_data)
                          if article_data:
                              save_to_mongo(article_data)


      if __name__ == '__main__':
          main()

    config.py
      # Address of the local proxy pool; /random returns one proxy per request
      PROXY_POOL_URL = 'http://127.0.0.1:5555/random'
      # Search keyword for the Sogou article search
      KEYWORD = 'python'
      # MongoDB connection settings
      MONGO_URI = 'localhost'
      MONGO_DB = 'weixin'
      # Maximum attempts per URL before giving up
      MAX_COUNT = 5
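
    The spider expects a proxy pool service on port 5555 whose /random endpoint returns a single proxy as a plain 'host:port' string, since get_proxy() uses response.text directly. If no pool is running, the following is a minimal hypothetical Flask stub of that interface; the proxy addresses in it are placeholders to be replaced with live proxies:

      # proxy_stub.py -- hypothetical stand-in for the pool behind PROXY_POOL_URL
      import random
      from flask import Flask

      app = Flask(__name__)

      # Placeholder addresses for illustration only; replace with real proxies.
      PROXIES = ['111.111.111.111:8080', '122.122.122.122:3128']

      @app.route('/random')
      def random_proxy():
          # Return one 'host:port' string as plain text, as spider.py expects.
          return random.choice(PROXIES)

      if __name__ == '__main__':
          app.run(port=5555)

    With the pool (or this stub) listening on port 5555 and MongoDB running locally, running python spider.py crawls up to 100 result pages and upserts each parsed article into the articles collection of the weixin database.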
• Original post: https://www.cnblogs.com/wanglinjie/p/9231559.html