Flask+微信公众号+爬虫 定时爬取水木清华招聘信息
Java定时器有Quartz,Python里面有apscheduler
import requests
from datetime import datetime
from pyquery import PyQuery as pq
import time
# Boards on the site that get scraped.
board_list = ['ITjob', 'Intern', 'ParttimeJobPost']
# Site root; board and post paths are appended to it.
base_url = 'http://www.newsmth.net/'
# Cache of scraped posts, keyed by each post's href.
data = {}
# Posts in listing order; a post's position is its number in the listing.
num = []
# Numbered listing text; stays None until scrawl_data() has run.
now = None
def get_content(url):
    """Fetch a post page and return the text of its first post body.

    The response is decoded as GBK, the first ``.a-content`` element inside
    ``.b-content`` is selected, and its ``<br>``, ``<p>`` and ``</p>`` tags
    are replaced by newlines before the remaining markup is stripped.

    :param url: absolute URL of the post page.
    :return: plain text of the post, or ``''`` when the page could not be
        fetched or no matching element was found.
    """
    resp = get(url)
    if resp is None:
        # get() returns None once all of its attempts have failed.
        return ''
    resp.encoding = 'gbk'
    content = pq(resp.text)('.b-content')('.a-content').eq(0).html()
    if not content:
        # Nothing matched the selector (or the element was empty).
        return ''
    # Turn line-breaking tags into real newlines so that paragraph structure
    # survives the tag stripping below.
    for tag in ('<br>', '<p>', '</p>'):
        content = content.replace(tag, '\n')
    return pq(content).text()
def get_board(board):
    """Scrape one board's listing into the module-level ``data`` cache.

    Rows carrying the ``top`` class are ignored. A row is only kept when its
    third cell contains ``':'``; that cell is then parsed as an ``HH:MM:SS``
    time on the current local date. Rows whose link is already cached are
    skipped, so each post body is fetched at most once.

    :param board: board name as used in the site's URL, e.g. ``'ITjob'``.
    :return: the shared ``data`` dict (also when the board page could not be
        fetched, in which case it is returned unchanged).
    """
    resp = get(base_url + 'nForum/board/%s?ajax' % board)
    if resp is None:
        # The listing could not be downloaded; keep what is already cached.
        return data
    resp.encoding = 'gbk'
    rows = pq(resp.text)('tbody')('tr').not_('.top')
    # Loop-invariant: the date part prepended to every row's time of day.
    today_prefix = time.strftime('%Y:%m:%d:', time.localtime())
    for row in rows.items():
        cells = row.find('td')
        stamp = cells.eq(2).text()
        if ':' not in stamp:
            continue
        try:
            posted = time.strptime(today_prefix + stamp, '%Y:%m:%d:%H:%M:%S')
        except ValueError:
            # Contains ':' but is not a full HH:MM:SS time; skip the row.
            continue
        url = cells.eq(1).find('a').attr('href')
        if not url or url in data:
            # No link to follow, or this post was already scraped.
            continue
        data[url] = {
            'time': posted,
            'title': cells.eq(1).text(),
            'id': url,
            'content': get_content(base_url + url),
        }
    return data
def get(url, retries=3, timeout=10):
    """Issue an HTTP GET for *url*, retrying on failure.

    :param url: address to fetch.
    :param retries: maximum number of attempts.
    :param timeout: per-attempt timeout in seconds, so that a stalled
        connection cannot block the caller indefinitely.
    :return: the ``requests`` response of the first attempt that answered
        with status 200, or ``None`` when every attempt failed.
    """
    for _ in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Network-level failure (connection error, timeout, ...): retry.
            continue
        if resp.status_code == 200:
            return resp
    return None
def scrawl_data():
    """Refresh the post cache and rebuild the numbered listing.

    When the current local hour is 0 the cache is emptied first. Every board
    in ``board_list`` is then scraped, and the module globals are replaced:
    ``num`` becomes the list of cached posts and ``now`` a text with one
    ``"<index> <title>"`` line per post, in the same order.

    Both values are built in local variables and published only once they
    are complete, so a concurrent reader does not see a half-built listing.
    """
    global now, num
    print(scrawl_data.__name__)
    if datetime.now().hour == 0:
        data.clear()
    for board in board_list:
        get_board(board)
    posts = list(data.values())
    listing = ''.join(
        '%d %s\n' % (idx, post['title']) for idx, post in enumerate(posts)
    )
    num = posts
    now = listing
    print(scrawl_data.__name__, 'over')
import hashlib
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime
import apscheduler.schedulers.background as bg
from flask import Flask, request
import scrawl
# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# NOTE(review): debug mode is switched on unconditionally — confirm this is
# intended outside of development.
app.debug = True
@app.route('/', methods=['GET', 'POST'])
def haha():
    """WeChat callback endpoint.

    GET: verifies the request by hashing the sorted concatenation of
    ``timestamp``, ``nonce`` and the configured token with SHA-1 and
    comparing the result with ``signature``. On a match ``echostr`` is
    echoed back; otherwise a 403 response is returned.

    POST: parses the XML message in the request body. Text messages are
    answered with ``reply()``; any other message type gets a fixed notice.
    A body that is not well-formed XML, or that lacks the addressing
    elements, is answered with a 400 response.
    """
    if request.method == 'GET':
        import hmac  # local import: only needed for the comparison below

        # NOTE(review): the token is hard-coded; consider loading it from
        # configuration instead.
        token = '20124003'  # token required by the WeChat configuration
        signature = request.args.get('signature', '')
        timestamp = request.args.get('timestamp', '')
        nonce = request.args.get('nonce', '')
        echostr = request.args.get('echostr', '')
        joined = ''.join(sorted([timestamp, nonce, token]))
        digest = hashlib.sha1(joined.encode('utf8')).hexdigest()
        # Compare as bytes: compare_digest rejects non-ASCII str arguments.
        if hmac.compare_digest(digest.encode('utf8'), signature.encode('utf8')):
            return echostr
        return 'invalid signature', 403

    # NOTE(review): the body is untrusted input and xml.etree is not hardened
    # against maliciously constructed XML; consider a hardened parser.
    try:
        xml = ET.fromstring(request.data)
        to_user = xml.find('ToUserName').text
        from_user = xml.find('FromUserName').text
        msg_type = xml.find('MsgType').text
    except (ET.ParseError, AttributeError):
        # Malformed XML, or one of the required elements is missing.
        return 'bad request', 400

    # The reply swaps the two parties: the sender becomes the recipient.
    if msg_type == 'text':
        content_node = xml.find('Content')
        content = content_node.text if content_node is not None else None
        return reply_text(from_user, to_user, reply(from_user, content or ''))
    return reply_text(from_user, to_user, "我只懂文字")
@app.route("/ok")
def ok():
    """Return the fixed string ``'i am ok'``."""
    status = 'i am ok'
    return status
@app.route("/what")
def what():
    """Return the string form of the scraper module's ``data`` cache."""
    cache = scrawl.data
    return str(cache)
def _cdata(value):
    """Wrap *value* in a CDATA section, splitting any embedded ``]]>``."""
    text = '' if value is None else str(value)
    # A literal "]]>" would end the section early, so it is split across two
    # adjacent CDATA sections; parsers join them back into the same text.
    return '<![CDATA[' + text.replace(']]>', ']]]]><![CDATA[>') + ']]>'


def reply_text(to_user, from_user, content):
    """Build the XML body of a passive text reply.

    :param to_user: id of the user the reply is addressed to.
    :param from_user: id of the account sending the reply.
    :param content: reply text; ``None`` is rendered as empty text.
    :return: the reply as an XML string, with ``CreateTime`` set to the
        current Unix time in whole seconds.
    """
    return (
        '\n<xml>\n'
        '<ToUserName>{}</ToUserName>\n'
        '<FromUserName>{}</FromUserName>\n'
        '<CreateTime>{}</CreateTime>\n'
        '<MsgType><![CDATA[text]]></MsgType>\n'
        '<Content>{}</Content>\n'
        '</xml>\n'
    ).format(_cdata(to_user), _cdata(from_user), int(time.time()),
             _cdata(content))
def reply(openid, msg):
    """Choose the reply text for a user's text message.

    A message consisting only of digits is treated as an index into
    ``scrawl.num`` and answered with that post's ``'content'``. Anything
    else, including an index that is out of range, is answered with the
    numbered listing ``scrawl.now``.

    :param openid: id of the sending user (not used here).
    :param msg: text the user sent; may be ``None`` or empty.
    :return: the reply text; a short notice when no listing is available.
    """
    text = (msg or '').strip()
    if re.fullmatch(r'\d+', text):
        try:
            idx = int(text)
        except ValueError:
            # int() can reject extremely long digit strings.
            idx = -1
        posts = scrawl.num
        if 0 <= idx < len(posts):
            return posts[idx]['content']
    # scrawl.now is None until the first scrape has finished.
    return scrawl.now or '暂无数据,请稍后再试'
# This initialisation must not be placed in the __main__ section: gunicorn
# does not execute that section when it runs the app.
# Run scrawl.scrawl_data from a background scheduler once per hour, with the
# first run scheduled for the current time.
scheduler = bg.BackgroundScheduler()
scheduler.add_job(scrawl.scrawl_data, 'interval', hours=1, next_run_time=datetime.now())
scheduler.start()
# Start Flask's built-in server only when this file is executed as a script.
if __name__ == '__main__':
    app.run()