• 对代码的优化 对抗 对硬件台数的提高


    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    import re
    from time import sleep
    
    # Start-up banner (the literal is printed verbatim).
    print('残差图')
    # Hold the script for three hours (value in seconds) before any work begins.
    t = 3600 * 3
    sleep(t)
    
    # Directory holding this script; data files and the SQLite status database
    # are resolved relative to it.
    curPath = os.path.abspath(os.path.dirname(__file__))
    # Parent directory, appended to the import search path.
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)

    # MAX_USED_TIMES: usage ceiling compared against ``today_used`` when a key is picked.
    # overrun_str:    quota-exceeded message text (not referenced in the visible code).
    # DB_KEY_EXHAUST: sentinel returned when no key qualifies.
    # next_day_tag:   ``%H%M%S`` clock reading at which the daily counters are reset.
    MAX_USED_TIMES = 1900
    overrun_str = '天配额超限,限制访问'
    DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'
    next_day_tag = '000003'

    # os.path.join replaces '%s\%s': that literal relies on an invalid escape
    # sequence and hard-codes the Windows separator.
    db = os.path.join(curPath, 'py_bdspider_status.db')
    
    
    # def db_init_key_table():
    #     conn = sqlite3.connect(db)
    #     c = conn.cursor()
    #     sql = 'DELETE  FROM  baidu_map_key_used'
    #     c.execute(sql)
    #     conn.commit()
    #     pcity_file = '%s\%s' % (curPath, 'bdmap_key.txt')
    #     with open(pcity_file, 'r', encoding='utf-8') as pf:
    #         c_ = 0
    #         for i in pf:
    #             if len(i) < 4:
    #                 continue
    #             author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
    #             localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    #             sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
    #                 author, key, localtime_, 0)
    #             c.execute(sql)
    #     conn.commit()
    #     conn.close()
    #     pf.close()
    
    # db_init_key_table()
    
    
    
    def db_recovery_bdkeynum():
        """Reset every key's ``today_used`` counter when the clock reads ``next_day_tag``.

        The reset only fires when this function is called during the exact
        second whose ``%H%M%S`` rendering equals ``next_day_tag``; at any other
        moment the call does nothing.
        """
        if time.strftime("%H%M%S", time.localtime()) != next_day_tag:
            return
        # Integer timestamp: same value the SQL text used to embed as an unquoted literal.
        stamp = int(time.strftime("%y%m%d%H%M%S", time.localtime()))
        conn = sqlite3.connect(db)
        try:
            conn.execute('UPDATE baidu_map_key_used SET today_used = 0, update_time = ?', (stamp,))
            conn.commit()
        finally:
            # Close the handle even when the UPDATE raises.
            conn.close()
    
    
    def db_get_one_effective():
        """Return the least-used API key whose ``today_used`` is at most ``MAX_USED_TIMES``.

        ``db_recovery_bdkeynum()`` runs first so a due daily reset is applied
        before the lookup.

        Returns:
            The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
        """
        db_recovery_bdkeynum()
        conn = sqlite3.connect(db)
        try:
            row = conn.execute(
                'SELECT key FROM baidu_map_key_used WHERE today_used <= ? ORDER BY today_used ASC',
                (MAX_USED_TIMES,),
            ).fetchone()
        finally:
            # Close the handle even when the query raises.
            conn.close()
        return DB_KEY_EXHAUST if row is None else row[0]
    
    
    def db_update_one_today_used(key):
        """Add one to ``today_used`` for ``key`` and stamp ``update_time`` with the local time.

        Args:
            key: API key identifying the row of ``baidu_map_key_used`` to update.
        """
        # Integer timestamp: same value the SQL text used to embed as an unquoted literal.
        stamp = int(time.strftime("%y%m%d%H%M%S", time.localtime()))
        conn = sqlite3.connect(db)
        try:
            # Bound parameters: the key is no longer spliced into the SQL text.
            conn.execute(
                'UPDATE baidu_map_key_used SET today_used = today_used + 1, update_time = ? WHERE key = ?',
                (stamp, key),
            )
            conn.commit()
        finally:
            conn.close()
    
    
    # dir_:                 folder (under curPath) that receives successful responses.
    # dir_exception:        folder that receives the marker files written after a failed request.
    # requested_file_list:  request names that already have a result file.
    dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
    # Both paths end with a separator because file names are concatenated
    # directly onto them.  The former '%s\%s\' literals were unterminated
    # (the final backslash escaped the closing quote).
    requested_file_dir_str = os.path.join(curPath, dir_, '')
    requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
    requested_file_dir = os.listdir(requested_file_dir_str)
    
    
    def gen_requested_file_list(file_postfix='.html'):
        """Append the request names of existing result files to ``requested_file_list``.

        A request name is the file name cut at the first ``&`` and then at the
        first occurrence of ``file_postfix``.  Names already in the list are
        not added again.

        Args:
            file_postfix: Extension text at which each file name is cut.
        """
        # Set mirror of the list so the duplicate check is O(1) per file.
        known = set(requested_file_list)
        for file_name in os.listdir(os.path.join(curPath, dir_)):
            # Use the bare file name; rebuilding the full path and splitting it
            # on ``dir_`` returned the wrong piece whenever ``dir_`` also
            # occurred earlier in the path.
            requested_file = file_name.split('&')[0].split(file_postfix)[0]
            if requested_file not in known:
                known.add(requested_file)
                requested_file_list.append(requested_file)
    
    
    # Extensions of result files that count as "already requested".
    file_postfix_l = ['.txt']
    # Seed requested_file_list from the files already on disk.
    for i in file_postfix_l:
        gen_requested_file_list(i)
    
    
    def gen_file_data(fname_source, file_type='.xlsx'):
        """Read the first worksheet of a workbook stored next to this script.

        Args:
            fname_source: Workbook file name without extension.
            file_type: Extension appended to ``fname_source``.

        Returns:
            One list per sheet row (every row, starting at index 0), each cell
            converted with ``str(cell.value)``.
        """
        base_path = '%s\%s' % (curPath, fname_source)
        workbook_path = '%s%s' % (base_path, file_type)
        workbook = xlrd.open_workbook(workbook_path, on_demand=True)
        first_sheet = workbook.sheet_by_index(0)
        rows = []
        for row_index in range(first_sheet.nrows):
            rows.append([str(cell.value) for cell in first_sheet.row(row_index)])
        # Free the workbook's resources once the rows are copied out.
        workbook.release_resources()
        return rows
    
    
    # request_dic is filled below as city -> district -> list of names to query.
    # The two type lists start empty and are not referenced again in the visible code.
    request_dic, target_type_list, target_type_except_list = {}, [], []
    
    # Workbook name (without extension) of the first input file.
    fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
    
    data_selfadd = gen_file_data(fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Return ``str_`` with every space, newline and tab character removed.

        Args:
            str_: Text to clean.

        Returns:
            The cleaned text.
        """
        # Newline and tab are spelled as escapes; a raw line break inside the
        # quotes is a syntax error.
        for unwanted in (' ', '\n', '\t'):
            str_ = str_.replace(unwanted, '')
        return str_
    
    
    # Sample names noted as working ("ok"):
    #   碧海富通城三期(3栋)
    #   =碧海富通城-三期(3栋)
    # Characters removed outright before a name is used as a query.  Newline and
    # tab are spelled as escapes (a raw line break inside the quotes is a syntax
    # error); the empty-string entry was dropped because replacing '' with ''
    # does nothing.
    replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '·', '.']


    def gen_bd_query_origin_name(name_):
        """Normalise a building name for use as the API query string.

        Every character listed in ``replace_to_empty_l`` is removed, full-width
        brackets become ASCII brackets and full-width question marks are dropped.

        Args:
            name_: Raw name taken from the input workbook.

        Returns:
            The cleaned name.
        """
        for unwanted in replace_to_empty_l:
            name_ = name_.replace(unwanted, '')
        # NOTE(review): the search strings on this line had degraded to '' and
        # ``replace('', '(')`` inserts '(' between every character.  Full-width
        # brackets / question mark are the assumed originals - confirm against
        # real input data.
        return name_.replace('（', '(').replace('）', ')').replace('？', '')
    
    
    # Queue every name from the first workbook that has no result file yet.
    # A set gives O(1) membership for the per-row check (the list scan was O(n)).
    already_requested = set(requested_file_list)
    for l in data_selfadd:
        dbid, area_code, type_, city, district, uid, name_, address, street, request_name, submit_time = l
        # Disabled filters kept for reference:
        # if city != '深圳市':
        #     continue
        # if len(uid.replace(' ', '')) > 0:
        #     continue
        request_name = gen_bd_query_origin_name(name_)
        request_name_chk = '%s%s%s' % (city, district, request_name)
        if request_name_chk in already_requested:
            continue
        # city -> district -> list of pending names, created on first use.
        pending = request_dic.setdefault(city, {}).setdefault(district, [])
        if request_name not in pending:
            pending.append(request_name)
    del data_selfadd
    
    # Second input workbook; its rows are queued the same way as the first.
    fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
    data_jmtool = gen_file_data(fname_source)
    # A set gives O(1) membership for the per-row check (the list scan was O(n)).
    already_requested = set(requested_file_list)
    for l in data_jmtool:
        dbid, area_code, name_, request_name, type_, city, district, addr, street, bd_status, bd_message, bd_res_str, city_bd, district_bd, business_bd, cityid_bd, name_bd, uid, lat_bd, lng_bd, compute_res, name_ratio_res, combine_ratio_res, uid_href = l
        # Disabled filters kept for reference:
        # if len(uid.replace(' ', '')) > 0:
        #     continue
        # if city != '深圳市':
        #     continue
        request_name = gen_bd_query_origin_name(name_)
        request_name_chk = '%s%s%s' % (city, district, request_name)
        if request_name_chk in already_requested:
            continue
        # city -> district -> list of pending names, created on first use.
        pending = request_dic.setdefault(city, {}).setdefault(district, [])
        if request_name not in pending:
            pending.append(request_name)
    del data_jmtool
    
    # Result directory with a trailing separator; write_res_file() concatenates
    # the file name directly onto it.  The former '%s\%s\' literal was
    # unterminated (the final backslash escaped the closing quote).
    write_res_file_dir = os.path.join(curPath, dir_, '')

    # Substrings that mark a response as a proxy/API failure; responses
    # containing any of them are reported instead of saved.
    ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
            'Address already in use']
    
    
    def write_res_file(input_, str_, dir_=write_res_file_dir, file_postfix='.txt'):
        """Save ``str_`` as ``<dir_><input_><file_postfix>`` unless it contains an error marker.

        Args:
            input_: File name stem.
            str_: Response text to store.
            dir_: Target directory, including its trailing separator.
            file_postfix: Extension appended to ``input_``.

        When ``str_`` contains any entry of ``ex_l`` the match is printed and
        nothing is written.
        """
        for ex in ex_l:
            if ex in str_:
                # ``ak`` / ``url_`` are locals of fun_(), never module globals;
                # look them up defensively so reporting a bad response does not
                # raise NameError.
                print('EXCEPTION-', ex, 'AK-', globals().get('ak'), 'URL-', globals().get('url_'))
                return
        fname = '%s%s%s' % (dir_, input_, file_postfix)
        # The with-block closes the file; no explicit close() is needed.
        with open(fname, 'w', encoding='utf-8') as ft:
            ft.write(str_)
        print('ok', threading.get_ident(), input_)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func(args)``; ``args`` is passed as one positional argument."""

        def __init__(self, func, args, name):
            super().__init__()
            self.name = name
            self.func = func
            self.args = args

        def run(self):
            """Invoke the stored callable with the stored argument."""
            target, payload = self.func, self.args
            target(payload)
    
    
    # Example request (API key replaced by a placeholder):
    # http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<YOUR_AK>
    # R-QUERY, R-CITY and R-AK are placeholders substituted in fun_().
    base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
    
    
    def fun_(city):
        """Query the suggestion API for every pending name of ``city`` and store each response.

        Names whose result file already exists are skipped.  Each successful
        request increments the usage counter of the key that was used.  When a
        request raises, a marker file is written to the exception directory.

        Args:
            city: Key of ``request_dic`` whose districts are processed.
        """
        for district in request_dic[city]:
            for request_name in request_dic[city][district]:
                request_name_chk = '%s%s%s' % (city, district, request_name)
                if request_name_chk in requested_file_list:
                    continue
                ak = db_get_one_effective()
                if ak == DB_KEY_EXHAUST:
                    print(DB_KEY_EXHAUST)
                    # No usable key left: stop the whole city.  ``break`` only
                    # left the inner loop and repeated the lookup per district.
                    return
                url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
                try:
                    # Timeout so a stalled connection cannot block the thread forever.
                    bd_res_json_str = requests.get(url_, timeout=30).text
                    db_update_one_today_used(ak)
                    write_res_file(request_name_chk, bd_res_json_str)
                except Exception as exc:
                    bd_res_json_str = '请求百度-异常'
                    write_res_file(request_name_chk, bd_res_json_str, requested_file_dir_exception_str)
                    # Include the exception so the failure cause is visible in the log.
                    print(request_name_chk, bd_res_json_str, repr(exc))
    
    
    # Optional CLI arguments: first and last city number handled by this
    # process (cities are numbered from 1 in sorted order by main()).
    try:
        start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
    except (IndexError, ValueError):
        # Missing or non-numeric arguments: fall back to city numbers 1..200.
        start_loop, stop_loop = -1, 200
    
    
    def main():
        """Start one worker thread per selected city and wait for all of them.

        Cities are taken in sorted order and numbered from 1; only those whose
        number lies within ``[start_loop, stop_loop]`` get a thread.
        """
        threads_list = []
        for nloop, city in enumerate(sorted(request_dic), start=1):
            if nloop < start_loop or nloop > stop_loop:
                continue
            threads_list.append(MyThread(fun_, city, fun_.__name__))
        for t in threads_list:
            # Set the attribute; ``t.setDaemon = False`` merely overwrote the
            # setDaemon method without changing the flag.
            t.daemon = False
            t.start()
        for t in threads_list:
            t.join()


    if __name__ == '__main__':
        main()

    python D:pyminecleanspider_mapget_uidapi_res.py 120 160

    python D:pyminecleanspider_mapget_uidapi_res.py 80 120


    python D:pyminecleanspider_mapget_uidapi_res.py 40 80

    python D:pyminecleanspider_mapget_uidapi_res.py 0 40

    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    # Directory holding this script; data files and the SQLite status database
    # are resolved relative to it.
    curPath = os.path.abspath(os.path.dirname(__file__))
    # Parent directory, appended to the import search path.
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)

    # MAX_USED_TIMES: usage ceiling compared against ``today_used`` when a key is picked.
    # overrun_str:    quota-exceeded message text (not referenced in the visible code).
    # DB_KEY_EXHAUST: sentinel returned when no key qualifies.
    MAX_USED_TIMES = 1900
    overrun_str = '天配额超限,限制访问'
    DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'

    # os.path.join replaces '%s\%s': that literal relies on an invalid escape
    # sequence and hard-codes the Windows separator.
    db = os.path.join(curPath, 'py_bdspider_status.db')
    
    
    # pcity_list = []
    # pcity_file = '%s\%s' % (curPath, '省会城市.txt')
    # with open(pcity_file, 'r', encoding='utf-8') as pf:
    #     c_ = 0
    #     for i in pf:
    #         c_ += 1
    #         if c_ == 3:
    #             c_ = 0
    #             pcity_list.append(i.replace(' ', '').replace('\n', '') + '市')
    # pcity_sorted_list = sorted(pcity_list)
    #
    # target_city_list_big = ['广州市', '厦门市', '深圳市', '北京市', '杭州市', '成都市', '上海市', '西安市']
    # target_city_list_pass = target_city_list_big
    #
    # for i in pcity_list:
    #     if i not in target_city_list_big:
    #         target_city_list_pass.append(i)
    
    
    # def db_init_key_table():
    #     conn = sqlite3.connect(db)
    #     c = conn.cursor()
    #     sql = 'DELETE  FROM  baidu_map_key_used'
    #     c.execute(sql)
    #     conn.commit()
    #     pcity_file = '%s\%s' % (curPath, 'bdmap_key.txt')
    #     with open(pcity_file, 'r', encoding='utf-8') as pf:
    #         c_ = 0
    #         for i in pf:
    #             if len(i) < 4:
    #                 continue
    #             author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
    #             localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    #             sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
    #                 author, key, localtime_, 0)
    #             c.execute(sql)
    #     conn.commit()
    #     conn.close()
    #     pf.close()
    
    # db_init_key_table()
    # target_city_list = target_city_list[0:11]
    # target_city_list = target_city_list[0:11]
    
    
    
    def db_get_one_effective():
        """Return the least-used API key whose ``today_used`` is at most ``MAX_USED_TIMES``.

        Returns:
            The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
        """
        conn = sqlite3.connect(db)
        try:
            row = conn.execute(
                'SELECT key FROM baidu_map_key_used WHERE today_used <= ? ORDER BY today_used ASC',
                (MAX_USED_TIMES,),
            ).fetchone()
        finally:
            # The former trailing ``conn.close`` was unreachable and lacked the
            # call parentheses, so every lookup leaked a connection.
            conn.close()
        return DB_KEY_EXHAUST if row is None else row[0]
    
    
    def db_update_one_today_used(key):
        """Add one to ``today_used`` for ``key`` and stamp ``update_time`` with the local time.

        Args:
            key: API key identifying the row of ``baidu_map_key_used`` to update.
        """
        # Integer timestamp: same value the SQL text used to embed as an unquoted literal.
        stamp = int(time.strftime("%y%m%d%H%M%S", time.localtime()))
        conn = sqlite3.connect(db)
        try:
            # Bound parameters: the key is no longer spliced into the SQL text.
            conn.execute(
                'UPDATE baidu_map_key_used SET today_used = today_used + 1, update_time = ? WHERE key = ?',
                (stamp, key),
            )
            conn.commit()
        finally:
            conn.close()
    
    
    # dir_:                 folder (under curPath) that receives successful responses.
    # dir_exception:        folder that receives the marker files written after a failed request.
    # requested_file_list:  request names that already have a result file.
    dir_, dir_exception, requested_file_list = 'baidu_map_uid_page', 'baidu_map_uid_page_exception', []
    # Both paths end with a separator because file names are concatenated
    # directly onto them.  The former '%s\%s\' literals were unterminated
    # (the final backslash escaped the closing quote).
    requested_file_dir_str = os.path.join(curPath, dir_, '')
    requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
    requested_file_dir = os.listdir(requested_file_dir_str)
    
    
    def gen_requested_file_list(file_postfix='.html'):
        """Append the request names of existing result files to ``requested_file_list``.

        A request name is the file name cut at the first ``&`` and then at the
        first occurrence of ``file_postfix``.  Names already in the list are
        not added again.

        Args:
            file_postfix: Extension text at which each file name is cut.
        """
        # Set mirror of the list so the duplicate check is O(1) per file.
        known = set(requested_file_list)
        for file_name in os.listdir(os.path.join(curPath, dir_)):
            # Use the bare file name; rebuilding the full path and splitting it
            # on ``dir_`` returned the wrong piece whenever ``dir_`` also
            # occurred earlier in the path.
            requested_file = file_name.split('&')[0].split(file_postfix)[0]
            if requested_file not in known:
                known.add(requested_file)
                requested_file_list.append(requested_file)
    
    
    def gen_file_data(fname_source, file_type='.xlsx'):
        """Read the first worksheet of a workbook stored next to this script.

        Args:
            fname_source: Workbook file name without extension.
            file_type: Extension appended to ``fname_source``.

        Returns:
            One list per sheet row (every row, starting at index 0), each cell
            converted with ``str(cell.value)``.
        """
        base_path = '%s\%s' % (curPath, fname_source)
        workbook_path = '%s%s' % (base_path, file_type)
        workbook = xlrd.open_workbook(workbook_path, on_demand=True)
        first_sheet = workbook.sheet_by_index(0)
        rows = []
        for row_index in range(first_sheet.nrows):
            rows.append([str(cell.value) for cell in first_sheet.row(row_index)])
        # Free the workbook's resources once the rows are copied out.
        workbook.release_resources()
        return rows
    
    
    # Pending work, filled below as city -> district -> {'uid_list', 'file_row_list'}.
    request_dic = {}
    # Category names; in the visible code they appear only in commented-out filters.
    target_type_list = ['售楼处', '酒店', '专科医院', '家电', '家居建材', '咖啡馆']
    target_type_except_list = [
        '住宅小区', '写字楼', '商场', '小学', '中学',
        '4S店', '汽车站', '火车站', '高铁站', '飞机场',
    ]
    # Extensions of result files that count as "already requested".
    file_postfix_l = ['.html', '.txt']
    # Seed requested_file_list from the files already on disk.
    for i in file_postfix_l:
        gen_requested_file_list(i)
    
    # Workbook name (without extension) of the first input file.
    fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
    # Alternative workbook carrying extra is_building columns (disabled):
    # fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700-is_building170901140053'
    data_selfadd = gen_file_data(fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Return ``str_`` with every space, newline and tab character removed.

        Args:
            str_: Text to clean.

        Returns:
            The cleaned text.
        """
        # Newline and tab are spelled as escapes; a raw line break inside the
        # quotes is a syntax error.
        for unwanted in (' ', '\n', '\t'):
            str_ = str_.replace(unwanted, '')
        return str_
    
    
    # gen_requested_file_list()
    # gen_requested_file_list('.txt')
    
    # Queue the uid of every row of the first workbook that has no saved result yet.
    # A set gives O(1) membership for the per-row check (the list scan was O(n)).
    already_requested = set(requested_file_list)
    for l in data_selfadd:
        # ``dbid`` instead of ``id`` so the builtin id() is not shadowed at module level.
        dbid, area_code, type_, city, district, uid, name, address, street, name_reduction, submit_time = l
        # Row layout of the is_building workbook (disabled):
        # id, area_code, type_, city, district, uid, name, address, street, name_reduction, submit_time, is_building, name_, addr_ = l

        # Clean first, then test the length - the same order the JMTool loop
        # uses - so tabs/newlines inside the cell no longer count towards it.
        city, district, uid = replace_illeagl_tag(city), replace_illeagl_tag(district), replace_illeagl_tag(uid)
        if len(uid) < 6:
            continue

        # Disabled filters kept for reference:
        # if type_ in target_type_except_list:
        #     continue
        # if len(uid.replace(' ', '')) < 6 or is_building == '0' or is_building == '1':
        #     continue
        input_ = '%s%s%s' % (city, district, uid)
        if input_ in already_requested:
            print('requested', input_)
            continue
        # city -> district -> bucket, created on first use.
        bucket = request_dic.setdefault(city, {}).setdefault(district, {'uid_list': [], 'file_row_list': []})
        if uid not in bucket['uid_list']:
            bucket['uid_list'].append(uid)
        # Every row is recorded, including rows that repeat a uid.
        bucket['file_row_list'].append(l)
    del data_selfadd
    
    # Second input workbook; its rows are queued the same way as the first.
    fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
    # Alternative workbook carrying extra is_building columns (disabled):
    # fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339-is_building170901140150'
    data_jmtool = gen_file_data(fname_source)
    # A set gives O(1) membership for the per-row check (the list scan was O(n)).
    already_requested = set(requested_file_list)
    for l in data_jmtool:
        dbid, area_code, name_, request_name, type_, city, district, addr, street, bd_status, bd_message, bd_res_str, city_bd, district_bd, business_bd, cityid_bd, name_bd, uid, lat_bd, lng_bd, compute_res, name_ratio_res, combine_ratio_res, uid_href = l
        # The is_building workbook appends three columns: is_building, name_, addr_ (disabled).

        # Disabled filter kept for reference:
        # if type_ in target_type_except_list:
        #     continue
        city, district, uid = replace_illeagl_tag(city), replace_illeagl_tag(district), replace_illeagl_tag(uid)
        # Spaces were already stripped above, so the plain length is enough.
        if len(uid) < 6:
            continue
        # if len(uid.replace(' ', '')) < 6 or is_building == '0' or is_building == '1':
        #     continue
        input_ = '%s%s%s' % (city, district, uid)
        if input_ in already_requested:
            print('requested', input_)
            continue

        # city -> district -> bucket, created on first use.
        bucket = request_dic.setdefault(city, {}).setdefault(district, {'uid_list': [], 'file_row_list': []})
        if uid not in bucket['uid_list']:
            bucket['uid_list'].append(uid)
        # Every row is recorded, including rows that repeat a uid.
        bucket['file_row_list'].append(l)
    del data_jmtool
    
    # Result directory with a trailing separator; write_res_file() concatenates
    # the file name directly onto it.  The former '%s\%s\' literal was
    # unterminated (the final backslash escaped the closing quote).
    write_res_file_dir = os.path.join(curPath, dir_, '')

    # Substrings that mark a response as a proxy/API failure; responses
    # containing any of them are reported instead of saved.
    ex_l = ['Proxy Error', 'APP IP校验失败','APP不存在,AK有误请检查再重试','The requested URL could not be retrieved','Address already in use']
    
    
    def write_res_file(str_, input_, dir_=write_res_file_dir, file_postfix='.txt'):
        """Save ``str_`` as ``<dir_><input_><file_postfix>`` unless it contains an error marker.

        Args:
            str_: Response text to store.
            input_: File name stem.
            dir_: Target directory, including its trailing separator.
            file_postfix: Extension appended to ``input_``.

        When ``str_`` contains any entry of ``ex_l`` the match is printed and
        nothing is written.
        """
        for ex in ex_l:
            if ex in str_:
                # ``ak`` / ``url_`` are locals of fun_(), never module globals;
                # look them up defensively so reporting a bad response does not
                # raise NameError.
                print('EXCEPTION-', ex, 'AK-', globals().get('ak'), 'URL-', globals().get('url_'))
                return
        fname = '%s%s%s' % (dir_, input_, file_postfix)
        # The with-block closes the file; no explicit close() is needed.
        with open(fname, 'w', encoding='utf-8') as ft:
            ft.write(str_)
        print('ok', threading.get_ident(), input_)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func(args)``; ``args`` is passed as one positional argument."""

        def __init__(self, func, args, name):
            super().__init__()
            self.name = name
            self.func = func
            self.args = args

        def run(self):
            """Invoke the stored callable with the stored argument."""
            target, payload = self.func, self.args
            target(payload)
    
    
    # Initialised here; not updated anywhere in the visible code.
    requested_type_counter = 0
    # Place-detail endpoint; UID and AK are placeholders substituted in fun_().
    base_url = 'http://api.map.baidu.com/place/v2/detail?uid=UID&output=json&scope=2&ak=AK'
    
    
    def fun_(city):
        """Fetch the place-detail response for every pending uid of ``city`` and store it.

        Uids whose result file already exists are skipped.  Each successful
        request increments the usage counter of the key that was used.  When a
        request raises, a marker file is written to the exception directory.

        Args:
            city: Key of ``request_dic`` whose districts are processed.
        """
        for district in request_dic[city]:
            for uid in request_dic[city][district]['uid_list']:
                input_ = '%s%s%s' % (city, district, uid)
                # Skip finished uids before touching the key table.
                if input_ in requested_file_list:
                    continue
                ak = db_get_one_effective()
                if ak == DB_KEY_EXHAUST:
                    print(DB_KEY_EXHAUST)
                    # No usable key left: stop the whole city.  ``break`` only
                    # left the inner loop and repeated the lookup per district.
                    return
                url_ = base_url.replace('UID', uid).replace('AK', ak)
                try:
                    # Timeout so a stalled connection cannot block the thread forever.
                    bd_res_json_str = requests.get(url_, timeout=30).text
                    db_update_one_today_used(ak)
                    write_res_file(bd_res_json_str, input_)
                except Exception as exc:
                    bd_res_json_str = '请求百度-异常'
                    write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
                    # Include the exception so the failure cause is visible in the log.
                    print(bd_res_json_str, input_, repr(exc))
    
    
    # Required CLI arguments: first and last city number handled by this process
    # (main() numbers the sorted cities from 1).  Missing or non-numeric
    # arguments raise here.
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
    
    # Earlier hard-coded range (disabled):
    # city_num, start_loop, stop_loop = len(request_dic), 0, 40
    # thread_max = city_num
    
    
    def main():
        """Start one worker thread per selected city and wait for all of them.

        Cities are taken in sorted order and numbered from 1; only those whose
        number lies within ``[start_loop, stop_loop]`` get a thread.
        """
        threads_list = []
        for nloop, city in enumerate(sorted(request_dic), start=1):
            if nloop < start_loop or nloop > stop_loop:
                continue
            threads_list.append(MyThread(fun_, city, fun_.__name__))
        for t in threads_list:
            # Set the attribute; ``t.setDaemon = False`` merely overwrote the
            # setDaemon method without changing the flag.
            t.daemon = False
            t.start()
        for t in threads_list:
            t.join()


    if __name__ == '__main__':
        main()
    

      

    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    import math
    # Directory holding this script; data files and the SQLite status database
    # are resolved relative to it.
    curPath = os.path.abspath(os.path.dirname(__file__))
    # Parent directory, appended to the import search path.
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)

    # MAX_USED_TIMES: usage ceiling compared against ``today_used`` when a key is picked.
    # overrun_str:    quota-exceeded message text (not referenced in the visible code).
    # DB_KEY_EXHAUST: sentinel returned when no key qualifies.
    MAX_USED_TIMES = 1700
    overrun_str = '天配额超限,限制访问'
    DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'

    # os.path.join replaces '%s\%s': that literal relies on an invalid escape
    # sequence and hard-codes the Windows separator.
    db = os.path.join(curPath, 'py_bdspider_status.db')
    
    
    # pcity_list = []
    # pcity_file = '%s\%s' % (curPath, '省会城市.txt')
    # with open(pcity_file, 'r', encoding='utf-8') as pf:
    #     c_ = 0
    #     for i in pf:
    #         c_ += 1
    #         if c_ == 3:
    #             c_ = 0
    #             pcity_list.append(i.replace(' ', '').replace('\n', '') + '市')
    # pcity_sorted_list = sorted(pcity_list)
    #
    # target_city_list_big = ['广州市', '厦门市', '深圳市', '北京市', '杭州市', '成都市', '上海市', '西安市']
    # target_city_list_pass = target_city_list_big
    #
    # for i in pcity_list:
    #     if i not in target_city_list_big:
    #         target_city_list_pass.append(i)
    
    
    # def db_init_key_table():
    #     conn = sqlite3.connect(db)
    #     c = conn.cursor()
    #     sql = 'DELETE  FROM  baidu_map_key_used'
    #     c.execute(sql)
    #     conn.commit()
    #     pcity_file = '%s\%s' % (curPath, 'bdmap_key.txt')
    #     with open(pcity_file, 'r', encoding='utf-8') as pf:
    #         c_ = 0
    #         for i in pf:
    #             if len(i) < 4:
    #                 continue
    #             author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
    #             localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    #             sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
    #                 author, key, localtime_, 0)
    #             c.execute(sql)
    #     conn.commit()
    #     conn.close()
    #     pf.close()
    
    # db_init_key_table()
    # target_city_list = target_city_list[0:11]
    # target_city_list = target_city_list[0:11]
    
    
    
    def db_get_one_effective():
        """Return the least-used API key whose ``today_used`` is at most ``MAX_USED_TIMES``.

        Returns:
            The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
        """
        conn = sqlite3.connect(db)
        try:
            row = conn.execute(
                'SELECT key FROM baidu_map_key_used WHERE today_used <= ? ORDER BY today_used ASC',
                (MAX_USED_TIMES,),
            ).fetchone()
        finally:
            # The former trailing ``conn.close`` was unreachable and lacked the
            # call parentheses, so every lookup leaked a connection.
            conn.close()
        return DB_KEY_EXHAUST if row is None else row[0]
    
    
    def db_update_one_today_used(key):
        """Add one to ``today_used`` for ``key`` and stamp ``update_time`` with the local time.

        Args:
            key: API key identifying the row of ``baidu_map_key_used`` to update.
        """
        # Integer timestamp: same value the SQL text used to embed as an unquoted literal.
        stamp = int(time.strftime("%y%m%d%H%M%S", time.localtime()))
        conn = sqlite3.connect(db)
        try:
            # Bound parameters: the key is no longer spliced into the SQL text.
            conn.execute(
                'UPDATE baidu_map_key_used SET today_used = today_used + 1, update_time = ? WHERE key = ?',
                (stamp, key),
            )
            conn.commit()
        finally:
            conn.close()
    
    
    # dir_:                 folder (under curPath) that receives successful responses.
    # dir_exception:        folder that receives the marker files written after a failed request.
    # requested_file_list:  request names that already have a result file.
    dir_, dir_exception, requested_file_list = 'baidu_map_uid_page', 'baidu_map_uid_page_exception', []
    # Both paths end with a separator because file names are concatenated
    # directly onto them.  The former '%s\%s\' literals were unterminated
    # (the final backslash escaped the closing quote).
    requested_file_dir_str = os.path.join(curPath, dir_, '')
    requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
    requested_file_dir = os.listdir(requested_file_dir_str)
    
    
    def gen_requested_file_list(file_postfix='.html'):
        """Append the request names of existing result files to ``requested_file_list``.

        A request name is the file name cut at the first ``&`` and then at the
        first occurrence of ``file_postfix``.  Names already in the list are
        not added again.

        Args:
            file_postfix: Extension text at which each file name is cut.
        """
        # Set mirror of the list so the duplicate check is O(1) per file.
        known = set(requested_file_list)
        for file_name in os.listdir(os.path.join(curPath, dir_)):
            # Use the bare file name; rebuilding the full path and splitting it
            # on ``dir_`` returned the wrong piece whenever ``dir_`` also
            # occurred earlier in the path.
            requested_file = file_name.split('&')[0].split(file_postfix)[0]
            if requested_file not in known:
                known.add(requested_file)
                requested_file_list.append(requested_file)
    
    
    def gen_file_data(fname_source, file_type='.xlsx'):
        """Read the first worksheet of a workbook stored next to this script.

        Args:
            fname_source: Workbook file name without extension.
            file_type: Extension appended to ``fname_source``.

        Returns:
            One list per sheet row (every row, starting at index 0), each cell
            converted with ``str(cell.value)``.
        """
        base_path = '%s\%s' % (curPath, fname_source)
        workbook_path = '%s%s' % (base_path, file_type)
        workbook = xlrd.open_workbook(workbook_path, on_demand=True)
        first_sheet = workbook.sheet_by_index(0)
        rows = []
        for row_index in range(first_sheet.nrows):
            rows.append([str(cell.value) for cell in first_sheet.row(row_index)])
        # Free the workbook's resources once the rows are copied out.
        workbook.release_resources()
        return rows
    
    
    # Pending work, filled below as city -> district -> {'uid_list', 'file_row_list'}.
    request_dic = {}
    # Category names; in the visible code they appear only in commented-out filters.
    target_type_list = ['售楼处', '酒店', '专科医院', '家电', '家居建材', '咖啡馆']
    target_type_except_list = [
        '住宅小区', '写字楼', '商场', '小学', '中学',
        '4S店', '汽车站', '火车站', '高铁站', '飞机场',
    ]
    # Extensions of result files that count as "already requested".
    file_postfix_l = ['.html', '.txt']
    # Seed requested_file_list from the files already on disk.
    for i in file_postfix_l:
        gen_requested_file_list(i)
    
    # Workbook name (without extension) of the first input file.
    fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
    # Alternative workbook carrying extra is_building columns (disabled):
    # fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700-is_building170901140053'
    data_selfadd = gen_file_data(fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Return ``str_`` with every space, newline and tab character removed.

        Args:
            str_: Text to clean.

        Returns:
            The cleaned text.
        """
        # Newline and tab are spelled as escapes; a raw line break inside the
        # quotes is a syntax error.
        for unwanted in (' ', '\n', '\t'):
            str_ = str_.replace(unwanted, '')
        return str_
    
    
    # gen_requested_file_list()
    # gen_requested_file_list('.txt')
    
    # Queue the uid of every row of the first workbook that has no saved result yet.
    # A set gives O(1) membership for the per-row check (the list scan was O(n)).
    already_requested = set(requested_file_list)
    for l in data_selfadd:
        # ``dbid`` instead of ``id`` so the builtin id() is not shadowed at module level.
        dbid, area_code, type_, city, district, uid, name, address, street, name_reduction, submit_time = l
        # Row layout of the is_building workbook (disabled):
        # id, area_code, type_, city, district, uid, name, address, street, name_reduction, submit_time, is_building, name_, addr_ = l

        # Clean first, then test the length - the same order the JMTool loop
        # uses - so tabs/newlines inside the cell no longer count towards it.
        city, district, uid = replace_illeagl_tag(city), replace_illeagl_tag(district), replace_illeagl_tag(uid)
        if len(uid) < 6:
            continue

        # Disabled filters kept for reference:
        # if type_ in target_type_except_list:
        #     continue
        # if len(uid.replace(' ', '')) < 6 or is_building == '0' or is_building == '1':
        #     continue
        input_ = '%s%s%s' % (city, district, uid)
        if input_ in already_requested:
            print('requested', input_)
            continue
        # city -> district -> bucket, created on first use.
        bucket = request_dic.setdefault(city, {}).setdefault(district, {'uid_list': [], 'file_row_list': []})
        if uid not in bucket['uid_list']:
            bucket['uid_list'].append(uid)
        # Every row is recorded, including rows that repeat a uid.
        bucket['file_row_list'].append(l)
    del data_selfadd
    
    # Second input workbook; its rows are queued the same way as the first.
    fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
    # Alternative workbook carrying extra is_building columns (disabled):
    # fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339-is_building170901140150'
    data_jmtool = gen_file_data(fname_source)
    # A set gives O(1) membership for the per-row check (the list scan was O(n)).
    already_requested = set(requested_file_list)
    for l in data_jmtool:
        dbid, area_code, name_, request_name, type_, city, district, addr, street, bd_status, bd_message, bd_res_str, city_bd, district_bd, business_bd, cityid_bd, name_bd, uid, lat_bd, lng_bd, compute_res, name_ratio_res, combine_ratio_res, uid_href = l
        # The is_building workbook appends three columns: is_building, name_, addr_ (disabled).

        # Disabled filter kept for reference:
        # if type_ in target_type_except_list:
        #     continue
        city, district, uid = replace_illeagl_tag(city), replace_illeagl_tag(district), replace_illeagl_tag(uid)
        # Spaces were already stripped above, so the plain length is enough.
        if len(uid) < 6:
            continue
        # if len(uid.replace(' ', '')) < 6 or is_building == '0' or is_building == '1':
        #     continue
        input_ = '%s%s%s' % (city, district, uid)
        if input_ in already_requested:
            print('requested', input_)
            continue

        # city -> district -> bucket, created on first use.
        bucket = request_dic.setdefault(city, {}).setdefault(district, {'uid_list': [], 'file_row_list': []})
        if uid not in bucket['uid_list']:
            bucket['uid_list'].append(uid)
        # Every row is recorded, including rows that repeat a uid.
        bucket['file_row_list'].append(l)
    del data_jmtool
    
    # Result directory with a trailing separator; write_res_file() concatenates
    # the file name directly onto it.  The former '%s\%s\' literal was
    # unterminated (the final backslash escaped the closing quote).
    write_res_file_dir = os.path.join(curPath, dir_, '')

    # Substrings that mark a response as a proxy/API failure; responses
    # containing any of them are reported instead of saved.
    ex_l = ['Proxy Error', 'APP IP校验失败','APP不存在,AK有误请检查再重试','The requested URL could not be retrieved']
    
    
    def write_res_file(str_, input_, dir_=write_res_file_dir, file_postfix='.txt'):
        """Save ``str_`` as ``<dir_><input_><file_postfix>`` unless it contains an error marker.

        Args:
            str_: Response text to store.
            input_: File name stem.
            dir_: Target directory, including its trailing separator.
            file_postfix: Extension appended to ``input_``.

        When ``str_`` contains any entry of ``ex_l`` the match is printed and
        nothing is written.
        """
        for ex in ex_l:
            if ex in str_:
                # ``ak`` / ``url_`` are locals of fun_(), never module globals;
                # look them up defensively so reporting a bad response does not
                # raise NameError.
                print('EXCEPTION-', ex, 'AK-', globals().get('ak'), 'URL-', globals().get('url_'))
                return
        fname = '%s%s%s' % (dir_, input_, file_postfix)
        # The with-block closes the file; no explicit close() is needed.
        with open(fname, 'w', encoding='utf-8') as ft:
            ft.write(str_)
        print('ok', threading.get_ident(), input_)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func(args)``; ``args`` is passed as one positional argument."""

        def __init__(self, func, args, name):
            super().__init__()
            self.name = name
            self.func = func
            self.args = args

        def run(self):
            """Invoke the stored callable with the stored argument."""
            target, payload = self.func, self.args
            target(payload)
    
    
    # Initialised here; not updated anywhere in the visible code.
    requested_type_counter = 0
    # Place-detail endpoint; UID and AK are placeholders substituted in fun_().
    base_url = 'http://api.map.baidu.com/place/v2/detail?uid=UID&output=json&scope=2&ak=AK'
    
    
    def fun_(city):
        """Fetch the place-detail response for every pending uid of ``city`` and store it.

        Uids whose result file already exists are skipped.  Each successful
        request increments the usage counter of the key that was used.  When a
        request raises, a marker file is written to the exception directory.

        Args:
            city: Key of ``request_dic`` whose districts are processed.
        """
        for district in request_dic[city]:
            for uid in request_dic[city][district]['uid_list']:
                input_ = '%s%s%s' % (city, district, uid)
                # Skip finished uids before touching the key table.
                if input_ in requested_file_list:
                    continue
                ak = db_get_one_effective()
                if ak == DB_KEY_EXHAUST:
                    print(DB_KEY_EXHAUST)
                    # No usable key left: stop the whole city.  ``break`` only
                    # left the inner loop and repeated the lookup per district.
                    return
                url_ = base_url.replace('UID', uid).replace('AK', ak)
                try:
                    # Timeout so a stalled connection cannot block the thread forever.
                    bd_res_json_str = requests.get(url_, timeout=30).text
                    db_update_one_today_used(ak)
                    write_res_file(bd_res_json_str, input_)
                except Exception as exc:
                    bd_res_json_str = '请求百度-异常'
                    write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
                    # Include the exception so the failure cause is visible in the log.
                    print(bd_res_json_str, input_, repr(exc))
    
    
    # Required CLI arguments: first and last city number handled by this process
    # (main() numbers the sorted cities from 1).  Missing or non-numeric
    # arguments raise here.
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
    
    # Earlier hard-coded range (disabled):
    # city_num, start_loop, stop_loop = len(request_dic), 0, 40
    # thread_max = city_num
    
    
    def main():
        """Start one worker thread per selected city and wait for all of them.

        Cities are the keys of ``request_dic`` in sorted order, numbered from 1.
        Only cities whose number lies within ``start_loop``..``stop_loop``
        (inclusive) get a ``MyThread`` running ``fun_``.
        """
        threads_list = []
        for nloop, city in enumerate(sorted(request_dic), start=1):
            if nloop < start_loop or nloop > stop_loop:
                continue
            threads_list.append(MyThread(fun_, city, fun_.__name__))
        for t in threads_list:
            # Assigning to `t.setDaemon` only rebinds the method name; set the
            # daemon flag itself so the workers are explicitly non-daemon.
            t.daemon = False
            t.start()
        for t in threads_list:
            t.join()
    
    
    # Start the crawl only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

      

    import xlrd
    import time
    import sys
    import os
    import requests
    import sqlite3
    import threading
    
    # Directory that contains this script, and its parent directory; the parent
    # is appended to the module search path.
    curPath = os.path.abspath(os.path.dirname(__file__))
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)

    # MAX_USED_TIMES: a key is handed out while its today_used count is <= this.
    # overrun_str: quota-exceeded message text (not referenced in the visible code).
    # DB_KEY_EXHAUST: sentinel returned when no key qualifies.
    MAX_USED_TIMES = 1700
    overrun_str = '天配额超限,限制访问'
    DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'

    # SQLite status database stored next to the script. os.path.join replaces
    # the '%s\%s' format, which relied on the invalid escape '\%' and produced a
    # usable path only on Windows.
    db = os.path.join(curPath, 'py_bdspider_status.db')
    
    
    # pcity_list = []
    # pcity_file = '%s\%s' % (curPath, '省会城市.txt')
    # with open(pcity_file, 'r', encoding='utf-8') as pf:
    #     c_ = 0
    #     for i in pf:
    #         c_ += 1
    #         if c_ == 3:
    #             c_ = 0
    #             pcity_list.append(i.replace(' ', '').replace('\n', '') + '市')
    # pcity_sorted_list = sorted(pcity_list)
    #
    # target_city_list_big = ['广州市', '厦门市', '深圳市', '北京市', '杭州市', '成都市', '上海市', '西安市']
    # target_city_list_pass = target_city_list_big
    #
    # for i in pcity_list:
    #     if i not in target_city_list_big:
    #         target_city_list_pass.append(i)
    
    
    # def db_init_key_table():
    #     conn = sqlite3.connect(db)
    #     c = conn.cursor()
    #     sql = 'DELETE  FROM  baidu_map_key_used'
    #     c.execute(sql)
    #     conn.commit()
    #     pcity_file = '%s\%s' % (curPath, 'bdmap_key.txt')
    #     with open(pcity_file, 'r', encoding='utf-8') as pf:
    #         c_ = 0
    #         for i in pf:
    #             if len(i) < 4:
    #                 continue
    #             author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
    #             localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    #             sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
    #                 author, key, localtime_, 0)
    #             c.execute(sql)
    #     conn.commit()
    #     conn.close()
    #     pf.close()
    
    # db_init_key_table()
    # target_city_list = target_city_list[0:11]
    # target_city_list = target_city_list[0:11]
    
    
    
    def db_get_one_effective():
        """Return the least-used API key that may still be used today.

        Returns:
            The ``key`` value of the ``baidu_map_key_used`` row with the smallest
            ``today_used`` that does not exceed ``MAX_USED_TIMES``, or the
            ``DB_KEY_EXHAUST`` sentinel when no row qualifies.
        """
        conn = sqlite3.connect(db)
        try:
            res = conn.execute(
                'SELECT key FROM baidu_map_key_used WHERE today_used<=? '
                'ORDER BY today_used ASC',
                (MAX_USED_TIMES,),
            ).fetchone()
        finally:
            # The original `conn.close` sat after both returns and lacked the
            # call parentheses, so the connection was never closed.
            conn.close()
        return DB_KEY_EXHAUST if res is None else res[0]
    
    
    def db_update_one_today_used(key):
        """Count one more use of ``key`` today and refresh its update time.

        Args:
            key: Value of the ``key`` column identifying the row to update.
        """
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        conn = sqlite3.connect(db)
        try:
            conn.execute(
                'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=? ',
                # The timestamp used to be spliced in unquoted, i.e. as a numeric
                # literal; binding it as an int keeps the stored value the same.
                # Binding the key avoids SQLite reading a double-quoted token as
                # a column name.
                (int(localtime_), key),
            )
            conn.commit()
        finally:
            # Close the connection even when the statement fails.
            conn.close()
    
    
    # Sub-directory names for fetched responses and for failed requests, plus the
    # shared list of request identifiers that already have a result file.
    dir_, dir_exception, requested_file_list = 'baidu_map_uid_page', 'baidu_map_uid_page_exception', []
    # Both paths end with a separator (the empty last component) because file
    # names are appended to them by plain string concatenation. The former
    # literal '%s\%s\' escaped its own closing quote and was not valid Python.
    requested_file_dir_str = os.path.join(curPath, dir_, '')
    requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
    # Listing taken once at start-up; it fails early if the directory is missing.
    requested_file_dir = os.listdir(requested_file_dir_str)
    
    
    def gen_requested_file_list(file_postfix='.html'):
        """Add the request identifier of every stored result file to ``requested_file_list``.

        The identifier is the file name up to its first '&', cut again at the
        first occurrence of ``file_postfix``. Identifiers already in the list
        are not added twice.

        Args:
            file_postfix: Extension text at which the file name is cut.
        """
        # Work on the bare file name. The previous version glued directory and
        # file name together and split on the directory name again, which broke
        # whenever the script path itself contained that name.
        already_listed = set(requested_file_list)  # O(1) duplicate checks
        for file_name in os.listdir(os.path.join(curPath, dir_)):
            requested_file = file_name.split('&')[0].split(file_postfix)[0]
            if requested_file not in already_listed:
                already_listed.add(requested_file)
                requested_file_list.append(requested_file)
    
    
    def gen_file_data(fname_source, file_type='.xlsx'):
        """Read the first sheet of a workbook stored next to the script.

        Args:
            fname_source: Workbook file name without extension, relative to the
                script directory.
            file_type: Extension appended to ``fname_source``.

        Returns:
            A list with one entry per sheet row; each entry is the list of that
            row's cell values converted with ``str``.
        """
        excel_ = os.path.join(curPath, '%s%s' % (fname_source, file_type))
        book = xlrd.open_workbook(excel_, on_demand=True)
        try:
            sheet = book.sheet_by_index(0)
            return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
        finally:
            # Release the workbook even if reading a sheet or cell fails.
            book.release_resources()
    
    
    # Request index filled by the row loops below:
    # city -> district -> {'uid_list': [...], 'file_row_list': [...]}.
    request_dic = {}
    # Place types of interest, and place types to leave out; the filter that
    # would use the second list is commented out further down.
    target_type_list = ['售楼处', '酒店', '专科医院', '家电', '家居建材', '咖啡馆']
    target_type_except_list = [
        '住宅小区', '写字楼', '商场', '小学', '中学',
        '4S店', '汽车站', '火车站', '高铁站', '飞机场',
    ]

    # Register the result files already on disk, once per known extension.
    file_postfix_l = ['.html', '.txt']
    for i in file_postfix_l:
        gen_requested_file_list(i)

    # First input workbook (collector-submitted tasks).
    fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700'
    # Alternative export that carries the extra is_building columns:
    # fname_source = '【TEAM】采集员新增任务133598条-楼宇归集-互异百度uid数51700-is_building170901140053'
    data_selfadd = gen_file_data(fname_source)
    
    
    def replace_illeagl_tag(str_):
        """Return ``str_`` with every space, newline and tab character removed.

        Args:
            str_: Text to clean.

        Returns:
            The cleaned text; all other characters are kept unchanged.
        """
        # The newline and tab are written as escapes: a raw line break inside a
        # quoted literal is a syntax error.
        for unwanted in (' ', '\n', '\t'):
            str_ = str_.replace(unwanted, '')
        return str_
    
    
    # gen_requested_file_list()
    # gen_requested_file_list('.txt')
    
    def _collect_requests(rows, city_col, district_col, uid_col, expected_cols):
        """Register every not-yet-requested uid of ``rows`` in ``request_dic``.

        Args:
            rows: Sheet rows as lists of strings.
            city_col: Index of the city column.
            district_col: Index of the district column.
            uid_col: Index of the Baidu uid column.
            expected_cols: Number of columns every row must have.

        Raises:
            ValueError: If a row does not have exactly ``expected_cols`` columns
                (the same error the former tuple unpacking raised).
        """
        # requested_file_list is not modified here, so one set built up front
        # replaces a linear list scan per row.
        already_requested = set(requested_file_list)
        for row in rows:
            if len(row) != expected_cols:
                raise ValueError(
                    'expected %s columns, got %s' % (expected_cols, len(row)))
            city = replace_illeagl_tag(row[city_col])
            district = replace_illeagl_tag(row[district_col])
            uid = replace_illeagl_tag(row[uid_col])
            # Rows whose cleaned uid is shorter than 6 characters are skipped.
            # The length is measured after cleaning for both workbooks.
            if len(uid) < 6:
                continue
            input_ = '%s%s%s' % (city, district, uid)
            if input_ in already_requested:
                print('requested', input_)
                continue
            district_entry = request_dic.setdefault(city, {}).setdefault(
                district, {'uid_list': [], 'file_row_list': []})
            if uid not in district_entry['uid_list']:
                district_entry['uid_list'].append(uid)
            # Every row is kept, including rows repeating a known uid.
            district_entry['file_row_list'].append(row)


    # Collector workbook, 11 columns: id, area_code, type_, city, district, uid,
    # name, address, street, name_reduction, submit_time.
    # NOTE: the alternative "-is_building" exports add three columns
    # (is_building, name_, addr_) and would need expected_cols raised by 3.
    _collect_requests(data_selfadd, city_col=3, district_col=4, uid_col=5, expected_cols=11)
    del data_selfadd  # free the rows before the second workbook is loaded

    fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339'
    # fname_source = '【TEAM】41876条JMTool官方数据百度POIuid_添加率0.9388_170830171339-is_building170901140150'
    data_jmtool = gen_file_data(fname_source)
    # JMTool workbook, 24 columns: dbid, area_code, name_, request_name, type_,
    # city, district, addr, street, bd_status, bd_message, bd_res_str, city_bd,
    # district_bd, business_bd, cityid_bd, name_bd, uid, lat_bd, lng_bd,
    # compute_res, name_ratio_res, combine_ratio_res, uid_href.
    _collect_requests(data_jmtool, city_col=5, district_col=6, uid_col=17, expected_cols=24)
    del data_jmtool
    
    # Default output directory for fetched responses. It ends with a separator
    # (the empty last component) because write_res_file appends the file name by
    # plain string formatting. The former literal '%s\%s\' escaped its own
    # closing quote and was not valid Python.
    write_res_file_dir = os.path.join(curPath, dir_, '')

    # Response fragments that mark an unusable answer; write_res_file does not
    # store a response containing any of them.
    ex_l = ['Proxy Error', 'APP IP校验失败']
    
    
    def write_res_file(str_, input_, dir_=write_res_file_dir, file_postfix='.txt'):
        """Store one response text as ``<dir_><input_><file_postfix>``.

        Nothing is written when ``str_`` contains one of the fragments in
        ``ex_l``; the first matching fragment is printed instead.

        Args:
            str_: Response text to store.
            input_: Request identifier used as the file name stem.
            dir_: Target directory, including its trailing separator.
            file_postfix: Extension appended to the file name.
        """
        matched = next((marker for marker in ex_l if marker in str_), None)
        if matched is not None:
            print('EXCEPTION-', matched)
            return
        fname = '{}{}{}'.format(dir_, input_, file_postfix)
        # The with-block closes the file on exit.
        with open(fname, 'w', encoding='utf-8') as out_file:
            out_file.write(str_)
        print('ok', threading.get_ident(), input_)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func`` with a single argument when it runs.

        ``args`` is passed to ``func`` as one object (it is not unpacked), and
        ``name`` becomes the thread's name.
        """

        def __init__(self, func, args, name):
            super().__init__()
            self.name = name
            self.func = func
            self.args = args

        def run(self):
            """Invoke the stored callable with the stored argument."""
            target, payload = self.func, self.args
            target(payload)
    
    
    # Counter initialised to zero; it is not updated anywhere in the visible code.
    requested_type_counter = 0
    # Baidu Place detail endpoint template. The 'UID' and 'AK' markers are
    # filled in with str.replace for every request.
    base_url = (
        'http://api.map.baidu.com/place/v2/detail'
        '?uid=UID&output=json&scope=2&ak=AK'
    )
    
    
    def fun_(city, timeout=30):
        """Request and store the Baidu place detail of every pending uid of ``city``.

        For each uid under each district of ``request_dic[city]`` the detail URL
        is requested with a currently usable API key, the key's usage counter is
        bumped via ``db_update_one_today_used`` and the response text is handed
        to ``write_res_file``.

        Args:
            city: Key of ``request_dic`` whose districts should be processed.
            timeout: Seconds to wait for the HTTP response before the request is
                treated as failed. Without it a stalled connection would block
                the worker thread forever.
        """
        for district in request_dic[city]:
            for uid in request_dic[city][district]['uid_list']:
                input_ = '%s%s%s' % (city, district, uid)
                # Skip already fetched uids before asking for a key, so that no
                # database query is spent on them.
                if input_ in requested_file_list:
                    continue
                ak = db_get_one_effective()
                if ak == DB_KEY_EXHAUST:
                    # No key has quota left. Leaving only the inner loop would
                    # just repeat this lookup once per remaining district.
                    print(DB_KEY_EXHAUST)
                    return
                url_ = base_url.replace('UID', uid).replace('AK', ak)
                try:
                    bd_res_json_str = requests.get(url_, timeout=timeout).text
                    db_update_one_today_used(ak)
                    write_res_file(bd_res_json_str, input_)
                except Exception:
                    # Best effort per uid: record the failure in the exception
                    # directory and carry on with the next uid.
                    bd_res_json_str = '请求百度-异常'
                    write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
                    print(bd_res_json_str, input_)
    
    
    # Number of distinct cities collected into request_dic.
    city_num = len(request_dic)
    # Inclusive bounds compared with main()'s city counter, which starts at 1.
    start_loop = 0
    stop_loop = 100
    thread_max = city_num
    
    
    def main():
        """Start one worker thread per selected city and wait for all of them.

        Cities are the keys of ``request_dic`` in sorted order, numbered from 1.
        Only cities whose number lies within ``start_loop``..``stop_loop``
        (inclusive) get a ``MyThread`` running ``fun_``.
        """
        threads_list = []
        for nloop, city in enumerate(sorted(request_dic), start=1):
            if nloop < start_loop or nloop > stop_loop:
                continue
            threads_list.append(MyThread(fun_, city, fun_.__name__))
        for t in threads_list:
            # Assigning to `t.setDaemon` only rebinds the method name; set the
            # daemon flag itself so the workers are explicitly non-daemon.
            t.daemon = False
            t.start()
        for t in threads_list:
            t.join()
    
    
    # Start the crawl only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    i18n在4种常见环境下使用的方式
    vue-style里面设置变量
    扫码登录功能如何实现?一文搞懂主流的扫码登录技术原理
    Gradle编译Spring源码
    Spring学习总结(7)-AOP
    Docker安装Redis
    Jvm相关文章
    上传项目到Github
    Win10系统安装MySQL Workbench 8
    [转]HashMap 和 currentHashMap 总结
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7470051.html
Copyright © 2020-2023  润新知