• 爬取江苏省预算公开 文件下载 【JS页面爬虫】


    import re, requests, json, os, time
    from io import BytesIO
     
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }
     
    #获取所有部门
    def getDepartment(pid , startNum):
        url = "http://yjsgk.jsczt.cn/front/department/getdepartmentname.do?pid=" + str(pid)
        res = requests.get(url, headers = headers)
        res = res.content
        json_res = json.loads(res)
        for i in range(len(json_res)):
            if i < startNum:
                continue
            data = json_res[i]
            #部门id
            groupid = data["iid"]
            #部门名称
            departmentName = data["name"]
            tempPath = "./" + departmentName + "/"
            print("编号:" + str(i), "部门:" + departmentName)
            #尝试获取资料
            try:
                getTry(groupid, tempPath)
            except:
            #二次尝试
                try:
                    getTry(groupid, tempPath)
                except:
                    print("获取失败", "编号:" + str(i), "部门:" + departmentName)
                 
    #尝试获取资料
    def getTry(groupid, path):
        #部门预决算公开管理文件
        # getbmglwj(groupid, path, 1)
        #部门预算
        getbudgetfinal(groupid, path, 2)
        #部门决算
        # getbudgetfinal(groupid, path, 3)
        #部门专项资金
        # getbmcontent(groupid, path, 4)
        #部门资产信息
        # getbmcontent(groupid, path, 5)
        #保命
        time.sleep(2)
     
    #部门预决算公开管理文件
    def getbmglwj(groupID, path, typeid):
        path = path + "部门预决算公开管理文件" + "/"
        createDir(path)
        #http://yjsgk.jsczt.cn/front/bmglwj/bmgkgkwj_page.do?page_num=1&groupid=4972&typeid=1
        url = "http://yjsgk.jsczt.cn/front/bmglwj/bmgkgkwj_page.do?page_num=1&groupid=" + str(groupID) + "&typeid=" + str(typeid)
        res = requests.get(url, headers = headers)
        res = res.content
        json_res = json.loads(res)
        for data in json_res["depPublicServices"]:
            file_uuid = data["file_uuid"]
            file_name = data["g_title"] + ".pdf"
            print("----" + file_name)
            file_url = "http://yjsgk.jsczt.cn/front/glwj/download.do?uuid=" + file_uuid
            filepath = path + file_name
            download(file_url, filepath)
     
             
    #部门预决算公开
    def getbudgetfinal(groupID, path , typeid):
        #http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=4972&typeid=2
        #http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=4972&typeid=3
        if typeid == 2:
            path = path + "部门预算公开" + "/"
        if typeid == 3:
            path = path + "部门决算公开" + "/"
        createDir(path)
        url = "http://yjsgk.jsczt.cn/front/budgetfinal/itemsandpag.do?page_num=1&groupid=" + str(groupID) + "&typeid=" + str(typeid)
        res = requests.get(url, headers = headers)
        res = res.content
        json_res = json.loads(res)
        i = 0
        for data in json_res["budgetTemplates"]:
            uuid = data["uuid"]
            name = data["b_title"]
            print("----" + name)
            url = "http://yjsgk.jsczt.cn/front/budgetfinal/getTemporaryFiles.do?uuid=" + uuid
            tempPath = path + name + "/"
            getbudgetUrl(url, tempPath)
            if i == 1:
                break
            i = i + 1
             
    def getbudgetUrl(url, path):
        createDir(path)
        res = requests.get(url, headers = headers)
        res = res.content
        json_res = json.loads(res)
        for data in json_res:
            iid = data["iid"]
            file_name = data["t_oldname"]
            file_url = "http://yjsgk.jsczt.cn/front/budgetfinal/download.do?iid=" + str(iid)
            filepath = path + file_name
            download(file_url, filepath)
     
    #部门专项资金公开
    def getbmcontent(groupID, path, typeid):
        #http://yjsgk.jsczt.cn/front/bmcontent/bmgcontent_page.do?page_num=1&groupid=4972&typeid=4
        if typeid == 4:
            path = path + "部门专项资金公开" + "/"
        if typeid == 5:
            path = path + "部门资产信息公开"  + "/"
        createDir(path)
        url = "http://yjsgk.jsczt.cn/front/bmcontent/bmgcontent_page.do?page_num=1&groupid=" + str(groupID) + "&typeid=" + str(typeid)
        res = requests.get(url, headers)
        res = res.content
        json_res = json.loads(res)
        for data in json_res["departmentContent"]:
            subjectiid = data["iid"]
            file_name = data["d_title"]
            print("----" + file_name)
            file_content = data["d_content"]
            #http://yjsgk.jsczt.cn/front/bmcontent/file_list.do?subjectiid=45810
            url = "http://yjsgk.jsczt.cn/front/bmcontent/file_list.do?subjectiid=" + str(subjectiid)
            res = requests.get(url, headers = headers)
            res = res.content
            if len(res) == 2:
                #无附件
                file_content = file_content.replace("</p>", "
    ")
                file_content = file_content.replace("&nbsp;", " ")
                file_content = re.sub("<.+?>","", file_content)
                filepath = path + file_name + ".txt"
                with open(filepath, "w")as f:
                    f.write(file_content)
            else:
                #有附件
                json_res = json.loads(res)
                for j_data in json_res:
                    file_iid = j_data["iid"]
                    file_name = j_data["file_oldname"]
                    filepath = path + file_name
                    #http://yjsgk.jsczt.cn/front/bmcontent/download.do?iid=4228
                    file_url = "http://yjsgk.jsczt.cn/front/bmcontent/download.do?iid=" + str(file_iid)
                    download(file_url, filepath)
     
    #下载
    def download(url, filepath):
        res = requests.get(url, headers = headers)
        data = res.content
        with open(filepath, "wb") as f:
            f.write(data)
         
        time.sleep(1)#保命....
     
    #创建目录
    def createDir(path):
        if not os.path.exists(path):
            os.makedirs(path)
     
    pid = 122
    #编号,0为从头开始下载
    startNum = 0
     
    getDepartment(pid, startNum)
    

    获取json页面

    帮人弄的,正好学了一下,如果爬取JS页面

  • 相关阅读:
    shell 案例
    linux 软链接和硬链接区别
    mac安装使用nginx
    Leetcode SQL_#176_第二高的薪水
    南邮ctf web题记录(上)
    CTFHub Web技能树
    XCTF web 新手练习区
    诊断工具--arthas使用教程
    prometheus--监控工具
    无状态状态机--cola stateMachine
  • 原文地址:https://www.cnblogs.com/douzujun/p/13149482.html
Copyright © 2020-2023  润新知