• Python crawler optimization and error log analysis


    Identifying the problem

    During long download runs the crawler would eventually abort with an exception, and each time the process had to be killed and restarted by hand. The goal is to optimize the crawler so it keeps running and needs less manual intervention.

    Error log analysis

    Going through the collected nohup.out file, the dominant error is an array index out of bounds. The likely causes (a minimal repro follows the list):
    1) The network is unstable and the HTTP request fails.
    2) The request succeeds, but parsing the returned HTML fails.
    3) The login cookie has expired.
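
    For illustration, a self-contained repro of that failure mode, using a made-up error-page row (the real lookup appears in the parsing code below):

    import re

    regex = re.compile(r"noticeId=(\d+)")
    # an error page (or login redirect) contains no notice link at all
    row_html = "<tr><td>error page, no notice link here</td></tr>"
    # findall() matches nothing, so [0] raises IndexError - the
    # "array index out of bounds" that kept showing up in nohup.out
    notice_id = regex.findall(row_html)[0]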

    Optimization approach

    Every place that issues a network request now checks that the status code is 200, and the HTML parsing code now guards list lengths and wraps the work in exception handling.
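
    Distilled, the guard pattern looks roughly like the sketch below (safe_get and parse_rows are illustrative helpers, not part of the script that follows):

    from bs4 import BeautifulSoup

    def safe_get(session, url, **kwargs):
        # return the response only on a successful HTTP 200; otherwise
        # report the error and return None so the caller can skip the page
        try:
            response = session.get(url, **kwargs)
            if response.status_code == 200:
                return response
        except Exception as e:
            print("request fail:", url, e)
        return None

    def parse_rows(html):
        # guard every lookup: find() may return None and the table may be empty
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", class_="ObjectListPage")
        if table is None or table.tbody is None:
            return []
        return table.tbody.find_all("tr")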

    Source code:

    import socket
    import time
    import os
    from datetime import datetime
    import re
    import yaml
    import requests
    from bs4 import BeautifulSoup
    
    # set the default socket timeout to 10s
    socket.setdefaulttimeout(10)
    s = requests.Session()
    
    
    # log in and return the session cookie dict
    def login():
        url = host_url + "j_spring_security_check"
    
        data = {
            "username": bzh_host_usr,
            "password": bzh_host_pwd
        }
    
        try:
            response = s.post(url, data=data, headers=headers)
            if response.status_code == 200:
                cookie = response.cookies.get_dict()
                print("login success")
                return cookie
        except Exception as e:
            print("login fail:", e)
    
    
    # number of result pages
    def get_pages():
        try:
            # query parameters belong in params=, not data=, on a GET request
            response = s.get(noticeListUrl, params=paramsNotice, headers=headers, cookies=cookie)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                pageCount = int(soup.find('span', id='PP_countPage').get_text())
                pageCount = pageCount if pageCount > 1 else 1
                return pageCount
        except Exception as e:
            print("get page_count fail:", e)
    
    
    # document ids
    def get_ids(pageCount):
        ids = []
        for p in range(int(pageCount)):
            paramsNotice['pageIndex'] = p + 1
    
            try:
                response = s.get(noticeListUrl, params=paramsNotice, headers=headers, cookies=cookie)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    trs = soup.find("table", class_='ObjectListPage').tbody.find_all("tr")
                    # extract the numeric noticeId from each row's link
                    regex = re.compile(r"noticeId=(\d+)")
    
                    for tr in trs:
                        if "标准化文档更新" in tr.text:
                            id = regex.findall(str(tr))[0]
                            ids.append(id)
                            print("bzh id:" + id)
    
                            last_update = tr.find_all("td")[1].get_text().strip()
                            date_format = time.strftime("%Y%m%d", time.strptime(last_update, "%Y-%m-%d %H:%M:%S"))
                            file_name = "标准化文档-" + date_format + ".rar"
    
                            crawlFile(id, file_name)
    
            except Exception as e:
                print("get ids fail:", e)
    
        return ids
    
    
    # download the archive attached to a notice
    def crawlFile(id, file_name):
        down_url = noticeURL + id
        metaFile = "./bzh/" + file_name

        try:
            response = s.get(down_url, headers=headers, cookies=cookie)
            if response.status_code != 200:
                print(file_name + " download fail, status:", response.status_code)
                return

            # the server sends the original file name GBK-encoded in this header
            content = response.headers.get('Content-Disposition')
            if content:
                filename = content[content.find('=') + 1:]
                filename = filename.encode('iso-8859-1').decode('GBK')
                print("remote:" + filename)

            with open(metaFile, 'wb') as f:
                f.write(response.content)

            print(file_name + " first download success")
            # only the newest archive is needed, so stop the whole run here
            exit(0)
        except Exception as e:
            print(file_name + " download fail", e)
    
    
    if __name__ == "__main__":
        yaml_path = os.path.join('../', 'config.yaml')
        with open(yaml_path, 'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
    
        host_url = config['host_url']
        noticeListUrl = host_url + config['noticeListUrl']
        noticeDetailUrl = host_url + config['noticeDetailUrl']
        noticeURL = host_url + config['noticeURL']
    
        bzh_host_usr = config['bzh_host_usr']
        bzh_host_pwd = config['bzh_host_pwd']
        table_meta_bg_date = config['table_meta_bg_date']
    
        # request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Referer": host_url + "login.jsp"
        }
    
        paramsNotice = {
            "queryStartTime": table_meta_bg_date
        }
    
        task_begin = datetime.now()
        print("Crawler begin time:" + str(task_begin))
    
        cookie = login()
        if not cookie:
            print("cookie is null")
            exit(0)

        pageCount = get_pages()
        if pageCount is None or pageCount < 1:
            print("page < 1")
            exit(0)
    
        ids = get_ids(pageCount)
    
        task_end = datetime.now()
        print("Crawler end time:" + str(task_end))
    
    

    Analyzing the results

    After the optimization the crawler runs without interruption; the exceptions that used to kill it are now caught and written to the error log.
    The updated code ran in production for 4 days, so I collected those 4 days of error logs and plotted the errors by hour of day, to see whether the timing suggests further optimizations.

    Source code:

    import os
    import matplotlib.pyplot as plt
    
    if __name__ == "__main__":
        print("analyze error of bzh crawler")
    
        # per-hour counters, keyed "00".."23"
        error_con = {}   # connection failures
        error_html = {}  # parse failures
        for i in range(0, 24):
            key = str(i).zfill(2)
            error_con[key] = 0
            error_html[key] = 0
    
        # each file under ./input holds one day's error log
        for error_file in sorted(os.listdir("./input")):
            with open('./input/' + error_file, 'r') as log:
                for line in log:
                    fields = line.split()
                    # the message text starts two characters after the first "-" past column 50
                    error_msg = line[line.find("-", 50) + 2:]
                    # the hour is the first two characters of the third whitespace-separated field
                    hour = fields[2][0:2]

                    if error_msg.find("get html failed") > -1:
                        error_con[hour] += 1
                    elif error_msg.find("parse detail html failed") > -1:
                        # parse failures are counted at half weight
                        error_html[hour] += 1 / 2
    
        # line chart
        plt.title("Errors by Hour (20190507-20190510)")
        plt.xlabel("Hour")
        plt.ylabel("Error Count")
    
        # matplotlib wants plain sequences, so materialize the dict views as lists
        hours = list(error_con.keys())
        plt.plot(hours, list(error_con.values()), color="r", linestyle="-", marker="^", linewidth=1, label="connect")
        plt.plot(hours, list(error_html.values()), color="b", linestyle="-", marker="s", linewidth=1,
                 label="parse html")
        plt.legend(loc='upper left', bbox_to_anchor=(0.55, 0.95))
    
        plt.show()
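
    Since the crawler host is likely headless (an assumption about the environment), writing the chart to a file instead of opening a window may be more practical:

    # save the figure to disk instead of (or before) calling plt.show()
    plt.savefig("error_hour_analyze.png", dpi=150, bbox_inches="tight")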
    
    

    Output line chart

    • Connect failures clearly outnumber parse failures: when the connection fails, the first page is never reached, so no HTML parsing can follow.
    • When the connection holds, parse failures are rare; across the 4 days of logs, at most 2 occurred within any single hour.
    • The two curves follow essentially the same trend, as expected.
    • The chart shows 3 peaks, around 4 a.m., 8 a.m., and 9 p.m. The remote server may be restarted on a schedule; a time filter that skips crawling at night could flatten the peaks (see the sketch after this list).
    • There are only 4 days of logs so far; after running longer, collect a bigger sample and check for patterns by day of week, day of month, month, and so on.
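
    A minimal sketch of such a time filter (the daytime window below is an assumption to be tuned once more logs are in):

    from datetime import datetime

    def should_run_now():
        # run only during the day; the 4 a.m. and 9 p.m. error peaks
        # fall outside this window
        hour = datetime.now().hour
        return 6 <= hour < 20

    if not should_run_now():
        print("night window, skip this run")
        exit(0)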
  • Original post: https://www.cnblogs.com/wanli002/p/10850384.html