• 水木清华小爬虫


    Flask+微信公众号+爬虫 定时爬取水木清华招聘信息

    Java定时器有Quartz,Python里面有apscheduler

    import requests
    from datetime import datetime
    from pyquery import PyQuery as pq
    import time
    
    # Boards visited on every crawl.
    board_list = ['ITjob', 'Intern', 'ParttimeJobPost']
    # Site root; board and post paths are appended to it.
    base_url = 'http://www.newsmth.net/'
    # Crawled posts keyed by post URL.
    data = {}
    # Posts in listing order; the list index is the number shown to the user.
    num = list()
    # Text listing of numbered titles; None until the first crawl has finished.
    now = None
    
    
    def get_content(url):
        """Download a post page and return the text of its first content block.

        :param url: absolute URL of the post page.
        :return: plain text of the first ``.b-content .a-content`` element, or
            an empty string when the page could not be fetched or the element
            is absent.
        """
        resp = get(url)
        if resp is None:
            # get() yields None once every attempt has failed.
            return ''
        resp.encoding = 'gbk'
        html = pq(resp.text)
        content = html('.b-content')('.a-content').eq(0).html()
        if not content:
            return ''
        # Turn line-level markup into real newlines before the remaining tags
        # are stripped. '<br/>' is included in case the serializer emits the
        # self-closing form -- TODO confirm against the installed pyquery.
        for tag in ('<br>', '<br/>', '<p>', '</p>'):
            content = content.replace(tag, '\n')
        return pq(content).text()
    
    
    def get_board(board):
        """Add the not-yet-seen posts of one board to the module-level ``data``.

        Only rows whose third column contains ``:`` are kept (presumably a
        time of day, i.e. a post made today -- verify against the site); the
        value is combined with the local date and parsed as ``HH:MM:SS``.
        Pinned rows (class ``top``) are ignored.

        :param board: board name inserted into ``nForum/board/<board>?ajax``.
        :return: the shared ``data`` dict, keyed by post URL.
        """
        resp = get(base_url + 'nForum/board/%s?ajax' % board)
        if resp is None:
            # Download failed after all attempts; keep what is already known.
            return data
        resp.encoding = 'gbk'
        rows = pq(resp.text)('tbody')('tr').not_('.top')
        today = time.strftime('%Y:%m:%d:', time.localtime())
        for row in rows.items():
            tds = row.find('td')
            stamp = tds.eq(2).text()
            if ':' not in stamp:
                continue
            try:
                posted = time.strptime(today + stamp, '%Y:%m:%d:%H:%M:%S')
            except ValueError:
                # Cell contains ':' but is not HH:MM:SS; skip instead of
                # aborting the whole board.
                continue
            title_cell = tds.eq(1)
            url = title_cell.find('a').attr('href')
            if not url or url in data:
                # No link in the row, or the post was collected earlier.
                continue
            data[url] = {
                'time': posted,
                'title': title_cell.text(),
                'id': url,
                'content': get_content(base_url + url),
            }
        return data
    
    
    def get(url, retries=3, timeout=10):
        """Fetch ``url`` over HTTP, retrying on failure.

        :param url: address to request.
        :param retries: maximum number of attempts.
        :param timeout: per-request timeout in seconds; without one a stalled
            connection would block the caller indefinitely.
        :return: the first response whose status code is 200, or ``None`` when
            every attempt failed or returned another status.
        """
        for _ in range(retries):
            try:
                resp = requests.get(url, timeout=timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): retry.
                continue
            if resp.status_code == 200:
                return resp
        return None
    
    
    def scrawl_data():
        """Crawl every board and rebuild the numbered listing.

        Clears ``data`` when the current hour is 0, calls ``get_board`` for
        each entry of ``board_list``, then publishes two module globals:
        ``num`` (posts in listing order) and ``now`` (one ``"<index> <title>"``
        line per post).
        """
        global now, num
        print(scrawl_data.__name__)
        if datetime.now().hour == 0:
            data.clear()
        for board in board_list:
            get_board(board)
        posts = list(data.values())
        listing = ''.join(
            '%d %s\n' % (index, post['title'])
            for index, post in enumerate(posts)
        )
        # Publish both results only once they are complete, so a reader never
        # sees a half-built listing or a list that does not match it.
        num = posts
        now = listing
        print(scrawl_data.__name__, 'over')
    
    
    import hashlib
    import re
    import time
    import xml.etree.ElementTree as ET
    from datetime import datetime
    
    import apscheduler.schedulers.background as bg
    from flask import Flask, request
    
    import scrawl
    
    # Flask application object that the route decorators in this file attach to.
    app = Flask(__name__)
    # NOTE(review): debug mode is switched on unconditionally -- confirm this is
    # intended outside of development.
    app.debug = True
    
    
    @app.route('/', methods=['GET', 'POST'])
    def haha():
        """Entry point for the WeChat server.

        GET: verifies the request signature (SHA-1 over the sorted
        concatenation of timestamp, nonce and token) and echoes ``echostr``
        back on success; answers 403 otherwise.

        POST: parses the XML message body and answers text messages with the
        result of ``reply``; every other message type receives a fixed text.

        :return: response body, or a ``(body, status)`` tuple on rejection.
        """
        if request.method == 'GET':
            import hmac  # local import: only used for the constant-time compare

            # NOTE(review): the token is hard-coded; consider loading it from
            # configuration instead.
            token = '20124003'  # token required by the WeChat configuration
            signature = request.args.get('signature', '')
            timestamp = request.args.get('timestamp', '')
            nonce = request.args.get('nonce', '')
            echostr = request.args.get('echostr', '')
            joined = ''.join(sorted([timestamp, nonce, token]))
            digest = hashlib.sha1(joined.encode('utf8')).hexdigest()
            # Compare as bytes so that a non-ASCII signature cannot raise.
            if hmac.compare_digest(digest.encode('utf8'),
                                   signature.encode('utf8')):
                return echostr
            # Returning nothing here would make Flask fail with a 500.
            return 'invalid signature', 403

        # NOTE(review): ElementTree is not hardened against malicious XML
        # (e.g. entity expansion) and this body comes from the network.
        try:
            xml = ET.fromstring(request.data)
        except ET.ParseError:
            return 'invalid xml', 400
        to_user = xml.findtext('ToUserName', '')
        from_user = xml.findtext('FromUserName', '')
        msg_type = xml.findtext('MsgType', '')
        if msg_type == "text":
            content = xml.findtext('Content', '')
            # The answer goes back to the sender, so the two names swap roles.
            return reply_text(from_user, to_user, reply(from_user, content))
        return reply_text(from_user, to_user, "我只懂文字")
    
    
    @app.route("/ok")
    def ok():
        """Return a fixed string for requests to ``/ok``."""
        message = 'i am ok'
        return message
    
    
    @app.route("/what")
    def what():
        """Return the string form of the crawler's ``data`` mapping."""
        crawled = scrawl.data
        return str(crawled)
    
    
    def _cdata_safe(value):
        """Return ``value`` as text that cannot end a CDATA section early."""
        # A literal "]]>" would close the section, so split it over two sections.
        return str(value).replace(']]>', ']]]]><![CDATA[>')


    def reply_text(to_user, from_user, content):
        """Build the XML body of a text-type reply.

        :param to_user: value placed in ``ToUserName`` (the recipient).
        :param from_user: value placed in ``FromUserName`` (the sender).
        :param content: text placed in ``Content``.
        :return: the XML document as a string; ``CreateTime`` is the current
            Unix time in whole seconds.
        """
        return """
        <xml>
            <ToUserName><![CDATA[{}]]></ToUserName>
            <FromUserName><![CDATA[{}]]></FromUserName>
            <CreateTime>{}</CreateTime>
            <MsgType><![CDATA[text]]></MsgType>
            <Content><![CDATA[{}]]></Content>
        </xml>
        """.format(_cdata_safe(to_user), _cdata_safe(from_user),
                   int(time.time()), _cdata_safe(content))
    
    
    def reply(openid, msg):
        """Choose the text to answer a user's message with.

        A message consisting only of digits is treated as an index into
        ``scrawl.num`` and answered with that post's content. Anything else,
        including an index that is out of range, is answered with the listing
        in ``scrawl.now``.

        :param openid: sender identifier (not used here).
        :param msg: text the user sent; may be ``None`` or empty.
        :return: post content or the current listing.
        """
        text = (msg or '').strip()
        # fullmatch guarantees int() below cannot fail on input like "12abc".
        if re.fullmatch(r'\d+', text):
            posts = scrawl.num  # read once; the crawler rebinds this name
            index = int(text)
            if index < len(posts):
                return posts[index]['content']
        return scrawl.now
    
    
    # Initialisation must not be placed under the __main__ guard: gunicorn does
    # not execute that part.
    scheduler = bg.BackgroundScheduler()
    scheduler.add_job(
        func=scrawl.scrawl_data,
        trigger='interval',
        hours=1,
        next_run_time=datetime.now(),
    )
    scheduler.start()
    if __name__ == '__main__':
        app.run()
    
    
  • 相关阅读:
    mybatis自动生成代码配置文件
    Struts2的类型转换器
    CSS布局自适应高度终极方法
    Winform WebBrowser控件对访问页面执行、改写、添加Javascript代码
    利用using语句解决Lock抛出异常时发生死锁的问题
    Flash与Silverlight终极大比拼
    System.Collections.Specialized.NameValueCollection PostVars
    Hook浏览器控件WebBrowser对WININET.dll的调用
    WebBrowser中打开新页面保留sessionid
    Linksys路由器自动重启加流量
  • 原文地址:https://www.cnblogs.com/weiyinfu/p/9210291.html
Copyright © 2020-2023  润新知