搜狗对微信公众平台的公众号和文章做了整合,使用代理爬取。
spider.py
"""Crawl WeChat articles through Sogou's WeChat search, rotating proxies when blocked.

Sogou indexes WeChat public accounts and articles; it answers HTTP 302 when it
decides a client is a bot, so every 302 triggers a switch to a fresh proxy from
a local proxy pool.  Parsed articles are upserted into MongoDB.
"""
from urllib.parse import urlencode

import pymongo
import requests
from lxml.etree import XMLSyntaxError
from pyquery import PyQuery as pq
from requests.exceptions import ConnectionError

from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

headers = {
    'Cookie': 'IPLOC=CN1100; SUID=194E796A2E08990A000000005B114E85; SUV=1527860869604056; ABTEST=1|1527860872|v1; SNUID=9FCBFCEF8680EB12510E6A9C86088B29; weixinIndexVisited=1; JSESSIONID=aaaqa95rD87Zu9-CJwlnw; sct=5; ppinf=5|1527862844|1529072444|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8Y3J0OjEwOjE1Mjc4NjI4NDR8cmVmbmljazoyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUh5bE5VSDJEVWNuSHBDWnVOVG9sN2tAd2VpeGluLnNvaHUuY29tfA; pprdig=EZE8CVVtoUTqmCoJj6bEWwKngY4di5UpGDFImTA9-1qrMK_tIJEtUyGR9_0Jcv5Xw1EuqLO9BNFvAKQv5DOQvmCWh-jxudk7SGv89NuhCLow7dxPysoOtLSI-keSaKVLKT82Vhg7rDBg0SlQ3y2uiG53lBUWL0wLVw4D_f_7MLg; sgid=17-35315605-AVsRVjwpV4ichpAzPibp6olGY; ppmdig=1527862844000000243bdb95cb03e086685bb1de06087c32',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36'
}

# Currently active proxy as a "host:port" string, or None for a direct connection.
proxy = None


def get_proxy():
    """Fetch one proxy address from the local proxy pool.

    Returns:
        The proxy as ``host:port`` text, or None if the pool is unreachable
        or answered with a non-200 status.
    """
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def get_html(url, count=1):
    """Download a Sogou search page, rotating proxies on anti-bot 302 redirects.

    Args:
        url: Absolute URL of the search-result page.
        count: Current attempt number; recursion stops once it reaches MAX_COUNT.

    Returns:
        The page HTML on success, otherwise None.
    """
    print('Crawling', url)
    print('Trying Count', count)
    global proxy
    if count >= MAX_COUNT:
        print('Tried Too Many Counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # Sogou redirects (302) when it has flagged this client — switch proxy.
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                # Bug fix: the retry must count toward MAX_COUNT; the original
                # called get_html(url) with the increment commented out, so a
                # pool that keeps serving blocked proxies recursed forever.
                return get_html(url, count + 1)
            print('Get Proxy Failed')
            return None
        # Any other status: fall through and return None implicitly.
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        return get_html(url, count + 1)


def get_index(keyword, page):
    """Fetch one page of Sogou article-search results (type=2) for *keyword*."""
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    url = base_url + urlencode(data)
    return get_html(url)


def parse_index(html):
    """Yield the article URLs found in a search-result page."""
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')


def get_detail(url):
    """Download an article page directly (no proxy); return HTML or None."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def parse_detail(html):
    """Extract title, body text, publish date and account info from an article page.

    Returns:
        A dict with keys title/content/date/nickname/wechat, or None when the
        markup is too broken for lxml to parse.
    """
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#publish_time').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None


def save_to_mongo(data):
    """Upsert one article into the 'articles' collection, keyed by its title."""
    # update_one(..., upsert=True) replaces the deprecated Collection.update(),
    # which was removed in PyMongo 4.x.
    if db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])


def main():
    """Crawl result pages 1-100 for KEYWORD and persist every parsable article."""
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            for article_url in parse_index(html):
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()
config.py
"""Configuration constants for the Sogou-WeChat spider."""

# Endpoint of the local proxy pool; each GET returns one proxy address.
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'

# Search keyword submitted to Sogou's WeChat article search.
KEYWORD = 'python'

# MongoDB connection target and database name for crawled articles.
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'

# Maximum fetch attempts per URL before giving up.
MAX_COUNT = 5