• 2020 Study Notes 05: crawler, some bugs fixed


    In the previous version of this crawler, letters were not classified by type, and large stretches of the scraped data were missing the reply time and the reply content.

    After optimizing the code, the crawl produces the following data:

    Only part of the data is shown here, but the scraped records are now essentially complete. (The cause of the missing reply times is demonstrated in the sketch after the code listing.)

    The code:

    # coding:utf-8
    import requests
    import json
    import pymysql
    from lxml import etree
    
    # MySQL connection used by db() below to store the scraped letters
    conn = pymysql.connect(
            host="localhost",
            user="root",
            port=3306,
            password="123456",
            database="bjxj")
    
    def db(conn, reqcontent, reqname, reqtime, resname, restime, rescontent, reqtype, isreply):
        """Insert one letter record; unanswered letters are stored with an empty reply time."""
        cursor = conn.cursor()
        if not isreply:  # isReply from the API is expected to be a boolean
            isreply = 0
            restime1 = ''
        else:
            isreply = 1
            restime1 = restime
        # Parameterized INSERT so pymysql handles quoting and escaping.
        cursor.execute(
            "INSERT INTO aaa (reqcontent,reqname,reqtime,resname,rescontent,reqtype,isreply,restime) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s);",
            [reqcontent, reqname, reqtime, resname, rescontent, reqtype, isreply, restime1])
        conn.commit()
        cursor.close()
    
    def shijinOU(json1, url, i):
        """POST the paging parameters to the mail-list API and dispatch each letter by type."""
        print(i)
        head = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
                'Content-Type': 'application/json;charset=UTF-8',
                }
        data_json = json.dumps(json1)
        r = requests.post(url, data=data_json, headers=head)
        html = r.content.decode("utf-8")
        print("Status code:", r.status_code)
        new_data = json.loads(html)
        # Map each letter type to the URL fragments of its detail endpoint.
        # (Note: 'suggesDetail' is not a typo here; it matches the endpoint
        # path used by the original code.)
        type_map = {
            '咨询': ('consult', 'consultDetail'),      # consultation
            '建议': ('suggest', 'suggesDetail'),       # suggestion
            '投诉': ('complain', 'complainDetail'),    # complaint
        }
        # Iterate over what the page actually returned (the last page may
        # hold fewer than 6 items) instead of a hard-coded range(0, 6).
        for item in new_data['mailList']:
            print(item)
            reqname = item['letter_title']
            reqtime = item['create_date']
            resname = item['org_id']
            isreply = item['isReply']
            reqtype = item['letter_type']
            if reqtype in type_map:
                lettertype, lettertype1 = type_map[reqtype]
                zixunTiqu(item['original_id'], reqname, reqtime, resname,
                          isreply, reqtype, lettertype, lettertype1)
    
    def zixunTiqu(AH, reqname, reqtime, resname, isreply, reqtype, lettertype, lettertype1):
        """Fetch one letter's detail page and extract the question body and the reply."""
        head = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            }
        url2 = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.' + lettertype + '.' + lettertype1 + '.flow?originalId=' + AH

        r = requests.get(url2, headers=head)
        html = r.content.decode("utf-8")
        html1 = etree.HTML(html)
        # The letter body is mirrored in the page's Description meta tag.
        reqcontent1 = html1.xpath('head/meta[@name="Description"]/@content')

        # The reply-time div's class attribute sometimes carries a trailing
        # space, so both spellings are queried; missing that variant appears
        # to be what caused the "reply time not scraped" bug before.
        restime1 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2 "]//text()')
        restime2 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2"]//text()')
        restime = ''
        rescontent = ''
        if len(restime1) == 0 and len(restime2) == 0:
            print("未回答")  # not yet answered
        else:
            restime = restime1[0] if restime1 else restime2[0]
            rescontent = html1.xpath('string(//div[@class="col-xs-12 col-md-12 column p-4 text-muted my-3"])').strip()

        print(rescontent)
        reqcontent = reqcontent1[0] if reqcontent1 else ''  # guard against a missing meta tag
        db(conn, reqcontent, reqname, reqtime, resname, restime, rescontent, reqtype, isreply)
    
    if __name__ == '__main__':
        # Each request returns one page of 6 letters; walk through 100 pages.
        for i in range(0, 100):
            print('***************************************************')
            page = 6 * i
            payload = {"PageCond/begin": page,
                       "PageCond/length": 6,
                       "PageCond/isCount": "true",
                       "keywords": "", "orgids": "",
                       "startDate": "", "endDate": "",
                       "letterType": "", "letterStatue": ""
                       }
            shijinOU(payload, "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.mailList.biz.ext", i)
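
    The earlier "reply time not scraped" failures trace back to an XPath detail: @class="..." is an exact string comparison, and on some detail pages the reply-time div's class attribute ends with a trailing space, so a single exact-match query silently misses those pages. The code above queries both spellings; a contains() predicate would cover both with one query. Below is a minimal sketch against a hypothetical inline fragment (not the live page), just to show the behavior:

    # coding:utf-8
    from lxml import etree

    # Hypothetical fragment imitating the two class spellings seen on the site:
    # the first div's class attribute ends with a trailing space.
    html = etree.HTML('''
    <div class="col-xs-12 col-sm-3 col-md-3 my-2 ">2020-01-15</div>
    <div class="col-xs-12 col-sm-3 col-md-3 my-2">2020-01-16</div>
    ''')

    # An exact @class match misses the variant with the trailing space...
    exact = html.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2"]//text()')
    print(exact)   # ['2020-01-16']

    # ...while contains() matches both spellings in one query.
    both = html.xpath('//div[contains(@class, "col-md-3 my-2")]//text()')
    print(both)    # ['2020-01-15', '2020-01-16']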
    

      

    html1 = etree.HTML(html)

    Summary: to extract element content from a page, convert the raw HTML string into an element tree with html1 = etree.HTML(html); the resulting tree can then be queried with XPath expressions to locate elements and read their values.
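
    As a minimal self-contained illustration of that flow (using a hypothetical inline HTML string rather than a crawled page):

    # coding:utf-8
    from lxml import etree

    raw = ('<html><head><meta name="Description" content="letter body"/></head>'
           '<body><div class="reply">  hello  </div></body></html>')
    tree = etree.HTML(raw)  # parse the raw string into an element tree

    # Attribute queries return a list of strings.
    print(tree.xpath('head/meta[@name="Description"]/@content'))  # ['letter body']

    # string(...) collapses a node's text content into a single string.
    print(tree.xpath('string(//div[@class="reply"])').strip())    # hello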

    
    
• Original post: https://www.cnblogs.com/xcl666/p/12267474.html