• thread_process_action


    import math
    import random
    import re
    import sys
    import threading
    from time import ctime, sleep
    from lxml import etree
    import pprint
    import requests
    from selenium import webdriver
    
    # Split every line of the seed file on double quotes and keep the
    # fragments that contain "http".
    POOL_URL_DISTRICT_LIST = []
    # The context manager closes the handle; a bare `f.closed` expression only
    # reads an attribute and never closes the file.
    with open('spider_北上广深_district.txt', 'r', encoding='utf-8') as f:
        for i in f:
            d = i.replace('\n', '').replace(' ', '').split('"')
            for ii in d:
                if ii.find('http') > -1:
                    POOL_URL_DISTRICT_LIST.append(ii)
    
    # Maps each mobile district URL built by gen_url() to its page count.
    POOL_URL_DISTRICT_MAXPAGE_NUM_DIC = {}
    
    # Crawl results keyed by page URL; filled by grab_todo_url_list().
    res_dic = {}
    
    # Examples of mobile page URLs:
    # https://m.lianjia.com/gz/ershoufang/tianhequ/pg23/
    # https://m.lianjia.com/sz/ershoufang/longhuaqu/pg23/
    # https://m.lianjia.com/bj/ershoufang/haidianqu/pg34/
    # Examples of desktop district URLs:
    # 'https://sz.lianjia.com/ershoufang/futianqu/'
    # 'https://gz.lianjia.com/ershoufang/panyu/'
    # 'https://bj.lianjia.com/ershoufang/miyun/'
    
    # Mobile district base URLs appended by gen_url().
    POOL_URL_DISTRICT_LIST_B = []
    # Default page count handed to gen_url() and used as the page-range bound.
    MAX_PAGE_NUM = 100
    
    
    def gen_url(num=MAX_PAGE_NUM):
        """Derive a mobile-site base URL from every desktop district URL.

        Each URL in ``POOL_URL_DISTRICT_LIST`` is split around ``'lianjia'``
        into a city prefix and a district path.  The resulting mobile URL is
        appended to ``POOL_URL_DISTRICT_LIST_B`` and registered in
        ``POOL_URL_DISTRICT_MAXPAGE_NUM_DIC`` with ``num`` as its value.

        Args:
            num: value stored for every generated URL.
        """
        for desktop_url in POOL_URL_DISTRICT_LIST:
            host_prefix, tail = desktop_url.split('//')[1].split('lianjia')
            # Drop the last character of the prefix (the dot in e.g. 'sz.').
            city = host_prefix[:-1]
            district = tail.split('.com/')[1]
            if city == 'sh':
                mobile_url = '%s%s/' % ('http://m.sh.lianjia.com/', district)
            else:
                mobile_url = '%s%s/%s' % ('https://m.lianjia.com/', city, district)

            POOL_URL_DISTRICT_MAXPAGE_NUM_DIC[mobile_url] = num
            POOL_URL_DISTRICT_LIST_B.append(mobile_url)
    
    
    # Populate POOL_URL_DISTRICT_LIST_B / POOL_URL_DISTRICT_MAXPAGE_NUM_DIC.
    gen_url()
    exception_url_list = []
    
    # Used by main() to derive the number of worker threads (evaluates to 240.0).
    URL_NUM_EACH_THREAD = 100 * 0.6 * 4
    res_dic = {}
    
    # QPS_TIME_UNIT_B = 2
    
    # Build the full work queue: one URL per district per page number.
    todo_url_list = []
    for url_ in POOL_URL_DISTRICT_LIST_B:
        # URLs containing 'sh.' use a 'd<N>' page suffix, all others 'pg<N>'.
        if url_.find('sh.') > -1:
            page_addition = 'd'
        else:
            page_addition = 'pg'
        # NOTE(review): range() stops at MAX_PAGE_NUM - 1, so page 100 is never
        # queued -- confirm whether that is intended.
        for page_num in range(1, MAX_PAGE_NUM, 1):
            url = '%s%s%s/' % (url_, page_addition, page_num)
            todo_url_list.append(url)
    
    LEN = len(todo_url_list)
    
    # NOTE(review): main() and deal_exception_url_list() create their own
    # drivers; this module-level instance looks unused in the visible code.
    browser = webdriver.Firefox()
    
    
    def grab_todo_url_list(browser):
        """Fetch pending listing pages with ``browser`` and record their items.

        Walks the shared ``todo_url_list`` from a random start index to its
        end.  Each page is loaded, parsed with lxml and, when recognised,
        stored under ``res_dic[url]['items_list']``; handled URLs are removed
        from ``todo_url_list``, unrecognised ones stay queued for a later pass.
        The browser is closed when the pass ends, even if an error is raised.

        Args:
            browser: a selenium WebDriver instance owned by this call.
        """
        global res_dic, todo_url_list

        # Randomly flip the shared queue so concurrent workers do not all
        # traverse it in the same direction.
        d = random.randint(1, 2)
        if d % 2 == 0:
            todo_url_list.reverse()

        my_control = len(todo_url_list)
        my_control_start = random.randint(0, my_control)
        try:
            for i in range(my_control_start, my_control, 1):
                # The queue shrinks while pages succeed (here and in other
                # threads), so the index may no longer exist; skip it rather
                # than reuse a stale or undefined ``url``.
                try:
                    url = todo_url_list[i]
                except IndexError:
                    continue
                if url not in todo_url_list:
                    continue
                sleep(1)
                browser.get(url)
                html = browser.page_source
                web_site = ''
                url_pass_flag = 0
                if html.find('price_total') > -1:
                    selector = etree.HTML(html)
                    url_l = selector.xpath('//a[@class="a_mask"]/@href')
                    des_l = selector.xpath('//div[@class="item_other text_cut"]/text()')
                    price_total_l = selector.xpath('//span[@class="price_total"]/em/text()')
                    unit_price_l = selector.xpath('//span[@class="unit_price"]/text()')
                    url_pass_flag = 1
                elif html.find('xiaoquname') > -1:
                    web_site = 'sh'
                    selector = etree.HTML(html)
                    url_l = selector.xpath('//ul[@class="fang-list"]/li/a/@href')
                    xiaoquname_l = selector.xpath('//span[@class="xiaoquname"]/text()')
                    area_l = selector.xpath('//p[@class="f-area"]/text()')
                    price_total_l = selector.xpath('//span[@class="f-price"]/text()')
                    url_pass_flag = 1
                # https://m.lianjia.com/bj/ershoufang/yizhuangkaifaqu/pg87
                # http://m.sh.lianjia.com/ershoufang/jinshan/d78
                elif html.find('搜索条件') > -1 or url.find('/lf/') > -1:
                    # Page has no listings to parse; treat it as handled.
                    print(111, url)
                    url_pass_flag = 2
                if url_pass_flag == 1:
                    res_dic[url] = {}
                    len_l = len(url_l)
                    res_dic[url]['items_list'] = []
                    # NOTE(review): the last matched link is left out, exactly
                    # as before -- confirm whether that is intentional.
                    len_l_ = len_l - 1
                    for item_index in range(0, len_l_, 1):
                        item = {}
                        item['spider_url'] = url
                        item['item_url'] = url_l[item_index]
                        item['des'] = des_l[item_index] if web_site == '' else '%s||%s' % (
                            area_l[item_index].replace('\n', '').replace(',', '').replace(' ', ''),
                            xiaoquname_l[item_index].replace(',', '').replace(' ', ''))
                        item['price_total'] = price_total_l[item_index]
                        item['unit_price'] = unit_price_l[item_index] if web_site == '' else 'sh'
                        res_dic[url]['items_list'].append(item)
                if url_pass_flag != 0:
                    if url in todo_url_list:
                        l_index = todo_url_list.index(url)
                        del todo_url_list[l_index]
                else:
                    if url not in todo_url_list:
                        todo_url_list.append(url)
        finally:
            # Release the browser window even when a page load or parse fails.
            browser.close()
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func`` with ``args`` as one positional argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            # The name is set through the attribute after base initialisation.
            self.name = name
            self.func = func
            self.args = args

        def run(self):
            """Invoke the stored callable, passing ``args`` as a single value."""
            target, payload = self.func, self.args
            target(payload)
    
    
    MAX_EXCEPTION_URL_NUM = 0
    
    
    def deal_exception_url_list():
        """Re-crawl until at most ``MAX_EXCEPTION_URL_NUM`` URLs remain queued.

        Every pass hands a fresh Firefox driver to ``grab_todo_url_list``,
        which closes it when done.  A loop replaces the former self-recursion
        so that many retry passes cannot hit the recursion limit, and a driver
        is only launched when there is still work to do.
        """
        global todo_url_list
        while len(todo_url_list) > MAX_EXCEPTION_URL_NUM:
            grab_todo_url_list(webdriver.Firefox())
    
    
    POOL_URL_LEN_B = len(POOL_URL_DISTRICT_LIST_B)
    
    
    def main():
        """Crawl all queued pages with worker threads and dump them to CSV.

        Starts ``ceil(LEN / URL_NUM_EACH_THREAD)`` threads, each with its own
        Firefox driver, waits for them, retries the remaining URLs and finally
        writes one CSV row per collected item.
        """
        print('starting at:', ctime())
        threads_list = []
        thread_sum = math.ceil(LEN / URL_NUM_EACH_THREAD)
        for nloop in range(0, thread_sum, 1):
            browser = webdriver.Firefox()
            thread_instance = MyThread(grab_todo_url_list, (browser), grab_todo_url_list.__name__)
            threads_list.append(thread_instance)
        # The main process exits only after all non-daemon threads have exited.
        for t in threads_list:
            # Assigning to ``setDaemon`` merely overwrote the method; the
            # ``daemon`` attribute is the real switch.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()
        # pprint.pprint(res_dic)
        deal_exception_url_list()
        print('end_r:', ctime())
        f_name = 'mobile_lianjia_ershoufang_BSGS.csv'
        # One handle for header and rows; the context manager flushes and
        # closes it (the bare ``f.closed`` expressions never did).
        with open(f_name, 'w', encoding='utf-8-sig') as f:
            f.write('spider_url,item_url,des,price_total,unit_price\n')
            for url in res_dic:
                try:
                    for d in res_dic[url]['items_list']:
                        row = '%s,%s,%s,%s,%s\n' % (
                            d['spider_url'], d['item_url'], d['des'], d['price_total'], d['unit_price'])
                        f.write(row)
                except Exception as exc:
                    # Best effort: report the failing page and keep writing.
                    print(url, exc)
        print('end_w:', ctime())
    
    
    if __name__ == '__main__':
        main()

    # -*- coding: UTF-8 -*-
    import math
    import random
    import sys
    import threading
    from time import ctime, sleep
    
    import requests
    
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    # Throttle settings: sleep QPS_TIME_UNIT seconds whenever a request counter
    # is a multiple of QPS.
    QPS = 50
    QPS_TIME_UNIT = 1
    # http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # The key is the first whitespace-delimited token of the second
    # tab-separated field of each line.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            try:
                list_ = i.split('\t')
                key = list_[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as exc:
                # Line without a usable key field: report it and move on.
                print(repr(i), exc)
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)

    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    touse_key = ''
    keywords = '&keywords='
    OFFSET = '&offset=2'
    CITYLIMIT = '&citylimit=false'
    # POI type codes requested below:
    # 060100    Shopping services / mall / mall
    # 060101    Shopping services / mall / shopping centre
    # 060102    Shopping services / mall / ordinary mall
    # 060400    Shopping services / supermarket / supermarket


    POI_TYPES = '&types=060100|060101|060102|060400'
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

    # Number of key-probe requests issued by change_key().
    change_key_qps = 0
    
    
    def change_key():
        """Switch the global ``touse_key`` to another key from ``KEY_POOL_LIST``.

        Every pooled key other than the current one is probed, in random
        order, with a request to ``URL_FOR_CHANGE_KEY``.  The first key whose
        response carries ``INFOCODE_OK`` becomes ``touse_key``.  When the pool
        holds no alternative key, the current key is left untouched.

        Raises:
            SystemExit: with ``'NOInvalidKEY'`` when alternatives exist but
                none of them validates.
        """
        global touse_key, change_key_qps
        # AMap does not honour its own QPS / daily-quota policy, so its return
        # codes cannot be used to schedule keys; every candidate is probed.
        candidates = [key for key in KEY_POOL_LIST if key != touse_key]
        if not candidates:
            return
        # One shuffled pass replaces the random start index plus recursion,
        # which could recurse without bound and never reached its exit branch.
        random.shuffle(candidates)
        for key in candidates:
            url = URL_FOR_CHANGE_KEY % (key)
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                json_ = requests.get(url).json()
            except (requests.RequestException, ValueError) as exc:
                print('requests.get(url)', exc)
                continue
            if json_.get('infocode') == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    
    
    FNAME = '【商场任务】28个城市_任务列表_20170727 - 副本.csv'
    # Task rows keyed by 1-based file line number (the header line is skipped).
    tosupply_dic = {}
    # Line numbers that still need an AMap lookup.
    todo_list = []
    file_line_num = 0
    with open(FNAME, 'r', encoding='gbk') as fo:
        for i in fo:
            file_line_num += 1
            if file_line_num == 1:
                continue
            todo_list.append(file_line_num)
            l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
            dic_ = {}
            dic_['sequence_number'] = l[0]
            dic_['type'] = l[1]
            dic_['city'] = l[2]
            dic_['district'] = l[3]
            dic_['address'] = l[4]
            dic_['name'] = l[5]
            # Pre-fill every AMap-derived column so the CSV writer finds all
            # keys, including 'gd_type_one', for rows that get no hit.
            dic_.update(dict.fromkeys(
                ('gd_type_one', 'gd_type_1', 'gd_type_2', 'gd_type_3', 'gd_name',
                 'gd_province', 'gd_city', 'gd_district', 'gd_address'), ''))
            tosupply_dic[file_line_num] = dic_
    LEN = len(todo_list)
    EACH_THREAD_REQUEST_NUM = 30

    requests_counter = 0
    tosupply_dic_len = len(tosupply_dic)
    tosupply_dic_len_ = tosupply_dic_len - 1
    
    
    def supply_dic(nloop):
        """Look up every pending task row on AMap and store the first hit.

        Rows that receive an ``infocode`` of ``'10000'`` are filled in (or
        marked ``'GD-NO-DATA'`` when the result count is zero) and removed
        from the shared ``todo_list``.  Rows whose request fails or is
        rejected stay in ``todo_list``; a rejection also triggers
        ``change_key()``.

        Args:
            nloop: worker label, only used in the progress output.
        """
        global tosupply_dic, requests_counter, todo_list
        print(len(todo_list))
        d = random.randint(1, 2)
        if d % 2 == 0:
            todo_list.reverse()

        # Iterate over a snapshot: the shared list is modified by this loop and
        # by other workers, which made direct iteration skip entries.
        for file_line_num in list(todo_list):
            if file_line_num not in todo_list:
                # Already completed by another worker.
                continue
            t = threading.current_thread()
            print('nloop=',nloop)
            print(' t._ident=',t.ident)
            dic_ = tosupply_dic[file_line_num]
            city = dic_['district']
            name = dic_['name']
            url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
            # NOTE(review): requests_counter is never incremented in the
            # visible code, so this pause happens before every request.
            if requests_counter % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                r = requests.get(url)
                r_json = r.json()
            except Exception:
                # Row stays in todo_list for a later pass.
                continue
            infocode = r_json['infocode']
            if infocode == '10000':
                count = r_json['count']
                if int(count) > 0:
                    pois_list = r_json['pois']
                    pos_dic = pois_list[0]
                    dic_['gd_type_one'] = pos_dic['type']
                    # The first '|' alternative holds up to three ';' levels;
                    # pad so a shorter value cannot break the unpacking.
                    type_levels = pos_dic['type'].split('|')[0].split(';')
                    type_levels = (type_levels + ['', '', ''])[:3]
                    dic_['gd_type_1'], dic_['gd_type_2'], dic_['gd_type_3'] = type_levels
                    dic_['gd_province'] = pos_dic['pname']
                    dic_['gd_city'] = pos_dic['cityname']
                    dic_['gd_district'] = pos_dic['adname']
                    dic_['gd_address'] = pos_dic['address']
                elif int(count) == 0:
                    dic_['gd_name'] = 'GD-NO-DATA'
                try:
                    todo_list.remove(file_line_num)
                except ValueError:
                    # Another worker removed it in the meantime.
                    pass
                print(file_line_num)
            else:
                # Rejected: keep the row pending and rotate the key.
                change_key()
    
    
    MAX_EXCEPTION_URL_NUM = 0
    
    
    def deal_exception_list():
        """Repeat ``supply_dic`` until ``todo_list`` is short enough.

        Loops while more than ``MAX_EXCEPTION_URL_NUM`` rows are pending and
        prints the pending list before every check.  ``supply_dic`` requires a
        worker label, so one is passed explicitly (the former bare call raised
        ``TypeError``).  A loop replaces the self-recursion so that many retry
        passes cannot hit the recursion limit.
        """
        global todo_list
        while True:
            print(todo_list)
            if len(todo_list) <= MAX_EXCEPTION_URL_NUM:
                return
            supply_dic('retry')
    
    
    class MyThread(threading.Thread):
        """Worker thread that runs ``func(args)`` when started."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.name = name
            # ``args`` is kept as one object and passed on unchanged.
            self.func, self.args = func, args

        def run(self):
            """Execute the stored callable with the stored argument."""
            callback = self.func
            callback(self.args)
    
    
    def main():
        """Enrich all task rows using worker threads and write ``GEN_28.csv``.

        Starts ``ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)`` threads
        running ``supply_dic``, waits for them, retries the remaining rows and
        writes one CSV line per task row.
        """
        print('starting at:', ctime())
        threads_list = []
        thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
        print(185, thread_sum)
        for nloop in range(0, thread_sum, 1):
            thread_instance = MyThread(supply_dic,(nloop),supply_dic.__name__)
            threads_list.append(thread_instance)
        # The main process exits only after all non-daemon threads have exited.
        for t in threads_list:
            # Assigning to ``setDaemon`` merely overwrote the method; the
            # ``daemon`` attribute is the real switch.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()
        deal_exception_list()

        FGEN = 'GEN_28.csv'
        columns = (
            'sequence_number', 'type', 'city', 'district', 'address', 'name',
            'gd_type_one', 'gd_type_1', 'gd_type_2', 'gd_type_3', 'gd_name',
            'gd_province', 'gd_city', 'gd_district', 'gd_address')
        # One handle for header and rows; the context manager flushes and
        # closes it (the bare ``fo.closed`` expressions never did).
        with open(FGEN, 'w', encoding='utf-8-sig') as fo:
            fo.write(
                '序号,类别编号,城市名称,区域    地址,商圈名,gd_type_one,gd_type_1,gd_type_2,gd_type_3,gd_name,gd_province,gd_city,gd_district,gd_address\n')
            for file_line_num in tosupply_dic:
                if file_line_num == 1:
                    continue
                dic_ = tosupply_dic[file_line_num]
                # 'gd_type_one' is only set for rows with an AMap hit, so a
                # missing column is written as an empty field.
                row = ','.join(str(dic_.get(column, '')) for column in columns)
                fo.write(row + '\n')
    
    
    if __name__ == '__main__':
        main()

    备注:16G 内存可以正常运行;4G 内存无法运行。

    # -*- coding: UTF-8 -*-
    import math
    import random
    import sys
    import threading
    from time import ctime, sleep
    
    import requests
    
    # fo = open('电影院任务列表_20170724_新增列 - 副本 (12).csv', 'r', encoding='utf-8')
    # file_line_num = 0
    # for i in fo:
    #     if file_line_num == 0:
    #         continue
    #     l = i.replace('\n','').split(',')
    #
    # ddd = 5
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    # Throttle settings: sleep QPS_TIME_UNIT seconds whenever a request counter
    # is a multiple of QPS.
    QPS = 50
    QPS_TIME_UNIT = 1
    # http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # The key is the first whitespace-delimited token of the second
    # tab-separated field of each line.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            try:
                list_ = i.split('\t')
                key = list_[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as exc:
                # Line without a usable key field: report it and move on.
                print(repr(i), exc)
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)

    # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=1
    # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=2
    # offset=2 gives results closer to the expected value.
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    touse_key = ''
    keywords = '&keywords='
    OFFSET = '&offset=2'
    ## Our own city names do not always match AMap's, so citylimit was
    ## switched from true to false.  Example row that yielded no data:
    # 华中区    湖北    钟祥                    电影院    横店荆门店    横店影视股份有限公司    0    GD-NO-DATA
    CITYLIMIT = '&citylimit=false'
    # 080601    Sports & leisure services / theatres / cinema
    POI_TYPES = '&types=080601'
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

    # Number of key-probe requests issued by change_key().
    change_key_qps = 0
    
    
    def change_key():
        """Switch the global ``touse_key`` to another key from ``KEY_POOL_LIST``.

        Every pooled key other than the current one is probed, in random
        order, with a request to ``URL_FOR_CHANGE_KEY``.  The first key whose
        response carries ``INFOCODE_OK`` becomes ``touse_key``.  When the pool
        holds no alternative key, the current key is left untouched.

        Raises:
            SystemExit: with ``'NOInvalidKEY'`` when alternatives exist but
                none of them validates.
        """
        global touse_key, change_key_qps
        # AMap does not honour its own QPS / daily-quota policy, so its return
        # codes cannot be used to schedule keys; every candidate is probed.
        candidates = [key for key in KEY_POOL_LIST if key != touse_key]
        if not candidates:
            return
        # One shuffled pass replaces the random start index plus recursion,
        # which could recurse without bound and never reached its exit branch.
        random.shuffle(candidates)
        for key in candidates:
            url = URL_FOR_CHANGE_KEY % (key)
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                json_ = requests.get(url).json()
            except (requests.RequestException, ValueError) as exc:
                print('requests.get(url)', exc)
                continue
            if json_.get('infocode') == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    
    
    FNAME = '电影院任务列表_20170724.csv'
    # Task rows keyed by 1-based file line number (the header line is skipped).
    tosupply_dic = {}
    file_line_num = 0
    with open(FNAME, 'r', encoding='gbk') as fo:
        for i in fo:
            file_line_num += 1
            if file_line_num == 1:
                continue
            # A run of eight commas marks a row that takes area, province,
            # city and parent company from the previous row.
            is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
            l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
            dic_ = {}
            dic_['sequence_number'] = l[0]
            dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
            dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
                'province']
            dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
            dic_['district'] = ''
            dic_['address'] = ''
            dic_['buliding'] = ''
            dic_['longitude_latitude'] = ''
            dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
            dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
            dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
            dic_['is_from_past_line'] = is_from_past_line
            dic_['gd_name'] = ''
            dic_['gd_city'] = ''
            tosupply_dic[file_line_num] = dic_

    EACH_THREAD_REQUEST_NUM = 30
    # Line numbers whose lookup failed and must be retried.
    exception_line_num_list = []

    # keywords = '&keywords='
    # OFFSET = '&offset=1'
    # CITYLIMIT = '&citylimit=true'
    # # 080601    Sports & leisure services / theatres / cinema
    # POI_TYPES = '&types=080601'
    #
    # http://restapi.amap.com/v3/place/text?key=&keywords=重庆万盛DAV国际影城&city=重庆&types=080601&offset=1&citylimit=true
    requests_counter = 0
    tosupply_dic_len = len(tosupply_dic)
    tosupply_dic_len_ = tosupply_dic_len - 1
    
    
    # thread_strat_file_line_num
    def supply_dic(thread_strat_file_line_num):
        """Enrich a run of task rows with AMap place-search results.

        Handles ``EACH_THREAD_REQUEST_NUM`` consecutive line numbers starting
        at ``thread_strat_file_line_num``.  Rows that already have a district
        are skipped.  Rows whose request fails or is rejected are queued in
        ``exception_line_num_list``; rows that get an answer are taken off it.

        Args:
            thread_strat_file_line_num: first file line number to process.
        """
        global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list

        def _forget(line_no):
            # Take the line off the retry list if it is queued there.
            if line_no in exception_line_num_list:
                del exception_line_num_list[exception_line_num_list.index(line_no)]

        def _remember(line_no):
            # Queue the line for a retry unless it is queued already.
            if line_no not in exception_line_num_list:
                exception_line_num_list.append(line_no)

        for step in range(EACH_THREAD_REQUEST_NUM):
            file_line_num = thread_strat_file_line_num + step
            if file_line_num - 2 > tosupply_dic_len_:
                # Past the last data row.
                return
            if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
                _forget(file_line_num)
                continue

            row = tosupply_dic[file_line_num]
            url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (
                URL_TYPE, touse_key, row['name'], row['city'], POI_TYPES, OFFSET, CITYLIMIT)
            if requests_counter % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                r_json = requests.get(url).json()
            except Exception:
                _remember(file_line_num)
                continue

            if r_json['infocode'] != '10000':
                # Rejected: retry later with a different key.
                _remember(file_line_num)
                change_key()
                continue

            hit_count = int(r_json['count'])
            if hit_count > 0:
                poi = r_json['pois'][0]
                row['district'] = poi['adname']
                address_missing = len(poi['address']) <= 2
                if address_missing:
                    print(poi)
                    print(poi['address'])
                row['address'] = '高德缺地址' if address_missing else poi['address']
                if address_missing:
                    print(row['address'])
                row['longitude_latitude'] = poi['location']
                row['gd_name'] = poi['name']
                row['gd_city'] = poi['cityname']
            elif hit_count == 0:
                row['gd_name'] = 'GD-NO-DATA'
            _forget(file_line_num)
    
    
    MAX_EXCEPTION_URL_NUM = 0
    
    
    def deal_exception_list():
        """Retry failed rows until the retry list is short enough.

        Loops while ``exception_line_num_list`` holds more than
        ``MAX_EXCEPTION_URL_NUM`` entries, calling ``supply_dic`` for each
        queued line number and printing the list before every check.  A loop
        replaces the self-recursion so that many retry passes cannot hit the
        recursion limit.
        """
        global exception_line_num_list
        while True:
            print(exception_line_num_list)
            if len(exception_line_num_list) <= MAX_EXCEPTION_URL_NUM:
                return
            # Iterate over a snapshot: supply_dic() edits the list in place,
            # which would make direct iteration skip entries.
            for thread_strat_file_line_num in list(exception_line_num_list):
                supply_dic(thread_strat_file_line_num)
    
    
    class MyThread(threading.Thread):
        """Thread wrapper that runs ``func`` with ``args`` as its one argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.name = name
            self.func = func
            # Stored as given and handed to ``func`` as a single value.
            self.args = args

        def run(self):
            """Call ``func(args)`` in the new thread."""
            job, argument = self.func, self.args
            job(argument)
    
    
    # 通灌北路58号苏宁广场7层701
    # 林源路创基尚城B区3层
    # 民治书香门第上河坊2栋2层
    # 东欣大道东欣广场城市综合体E01四楼
    # 横栏镇茂辉工业区乐丰四路21号永耀商业广场B幢之四(永耀人才市场旁)
    # 信江路西侧金峰城市广场1栋5层502(鹏泰购物广场5楼、迪欧咖啡楼上)
    # 上陡门学院中路人才大厦一层(东瓯影城)
    # 丰庆路710号(世纪联华超市4楼)
    # 风度中路13号百老汇商业城5层(美特斯邦威楼上)
    # 龙阳路2000号(龙阳广场5层)
    # 容城大道东12号(容城天骄写字楼3层)
    # 解放大道387号(汉口宗关水厂)南国西汇城市广场二期5层
    # 后沙峪镇安泰大街9号院(中粮祥云小镇)7号楼2层
    # 华强新天地3楼华时代美食城门口(横店电影院门口)
    # 南三环西路16号1号楼首地大峡谷购物中心5层
    # 永兴路7号院1号楼龙湖北京大兴天街购物中心L3层Z2
    def gen_building(str):
        """Cut a building name out of an address string.

        A start marker and an end marker are searched for in ``str``; when
        either is found, the characters between the computed start and end
        positions are returned with parentheses removed.  Otherwise ``''`` is
        returned.

        NOTE(review): several search literals below appear as empty strings in
        this copy of the source, presumably delimiter characters lost in
        transit.  If they really are empty, ``str.find('')`` is always 0, the
        first branch always wins and the slice ``[1:1]`` is empty -- restore
        the intended characters from the original script.
        """
        start_ = 0
        end_ = len(str)
        res = ''
        start_flag = 0
        end_flag = 0
        # Start position: just after the first start marker that matches.
        if str.find('') > -1:
            start_ = str.find('') + 1
            start_flag = 1
        elif str.find('交汇处') > -1:
            # Skip the whole three-character marker.
            start_ = str.find('交汇处') + 1 + 2
            start_flag = 1
        elif str.find('交叉口') > -1:
            # Skip the whole three-character marker.
            start_ = str.find('交叉口') + 1 + 2
            start_flag = 1
        elif str.find('') > -1:
            start_ = str.find('') + 1
            start_flag = 1
        elif str.find('') > -1:
            start_ = str.find('') + 1
            start_flag = 1
    
        # End position: one past the first end marker that matches.
        if str.find('') > -1:
            end_ = str.find('') + 1
            end_flag = 1
        elif str.find('') > -1:
            end_ = str.find('') + 1
            end_flag = 1
        if start_flag == 1 or end_flag == 1:
            res = ''.join((list(str)[start_:end_]))
            if res.find('(') > -1 or res.find('') > -1:
                # new rule: strip parentheses from the extracted name
                res = res.replace('(', '').replace(')', '').replace('', '').replace('', '')
        return res
    
    
    def main():
        """Enrich all cinema rows using worker threads and write the result CSV.

        Starts ``ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)`` threads,
        each covering a consecutive run of file line numbers, waits for them,
        retries failed rows and writes one CSV line per task row.
        """
        print('starting at:', ctime())
        threads_list = []
        thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
        print(185, thread_sum)
        for nloop in range(0, thread_sum, 1):
            # Data rows start at file line 2.  Starting the ranges at 0 left
            # the last one or two rows uncovered whenever the row count was
            # 0 or 29 modulo EACH_THREAD_REQUEST_NUM.
            thread_strat_file_line_num = 2 + nloop * EACH_THREAD_REQUEST_NUM
            print(thread_strat_file_line_num)
            thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
            threads_list.append(thread_instance)
        # The main process exits only after all non-daemon threads have exited.
        for t in threads_list:
            # Assigning to ``setDaemon`` merely overwrote the method; the
            # ``daemon`` attribute is the real switch.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()
        # pprint.pprint(res_dic)

        deal_exception_list()
        for i in exception_line_num_list:
            print('EXCEPTION', i)
        FGEN = '电影院任务列表_20170724_新增列_已计算楼宇.csv'
        # One handle for header and rows; the context manager flushes and
        # closes it (the bare ``fo.closed`` expressions never did).
        with open(FGEN, 'w', encoding='utf-8-sig') as fo:
            fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is_from_past_line,gd_name,gd_city\n')
            for file_line_num in tosupply_dic:
                if file_line_num == 1:
                    continue
                dic_ = tosupply_dic[file_line_num]
                # Commas inside values are replaced so they cannot split a column.
                row = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                    dic_['sequence_number'], dic_['area'], dic_['province'], dic_['city'], dic_['district'],
                    dic_['address'].replace(',', '  '),
                    gen_building(dic_['address']), dic_['longitude_latitude'].replace(',', ' '), dic_['busniess_type'],
                    dic_['name'],
                    dic_['parent_company'],
                    dic_['is_from_past_line'], dic_['gd_name'], dic_['gd_city'])
                fo.write(row)
    
    
    if __name__ == '__main__':
        main()
    # -*- coding: UTF-8 -*-
    import math
    import random
    import sys
    import threading
    from time import ctime, sleep
    
    import requests
    
    # fo = open('电影院任务列表_20170724_新增列 - 副本 (12).csv', 'r', encoding='utf-8')
    # file_line_num = 0
    # for i in fo:
    #     if file_line_num == 0:
    #         continue
    #     l = i.replace('\n','').split(',')
    #
    # ddd = 5
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    # Throttle settings: sleep QPS_TIME_UNIT seconds whenever a request counter
    # is a multiple of QPS.
    QPS = 50
    QPS_TIME_UNIT = 1
    # http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # The key is the first whitespace-delimited token of the second
    # tab-separated field of each line.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            try:
                list_ = i.split('\t')
                key = list_[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as exc:
                # Line without a usable key field: report it and move on.
                print(repr(i), exc)
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)

    # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=1
    # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=2
    # offset=2 gives results closer to the expected value.
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    touse_key = ''
    keywords = '&keywords='
    OFFSET = '&offset=2'
    ## Our own city names do not always match AMap's, so citylimit was
    ## switched from true to false.  Example row that yielded no data:
    # 华中区	湖北	钟祥					电影院	横店荆门店	横店影视股份有限公司	0	GD-NO-DATA
    CITYLIMIT = '&citylimit=false'
    # 080601	Sports & leisure services / theatres / cinema
    POI_TYPES = '&types=080601'
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

    # Number of key-probe requests issued by change_key().
    change_key_qps = 0
    
    
    def change_key():
        """Switch the global ``touse_key`` to another key from ``KEY_POOL_LIST``.

        Every pooled key other than the current one is probed, in random
        order, with a request to ``URL_FOR_CHANGE_KEY``.  The first key whose
        response carries ``INFOCODE_OK`` becomes ``touse_key``.  When the pool
        holds no alternative key, the current key is left untouched.

        Raises:
            SystemExit: with ``'NOInvalidKEY'`` when alternatives exist but
                none of them validates.
        """
        global touse_key, change_key_qps
        # AMap does not honour its own QPS / daily-quota policy, so its return
        # codes cannot be used to schedule keys; every candidate is probed.
        candidates = [key for key in KEY_POOL_LIST if key != touse_key]
        if not candidates:
            return
        # One shuffled pass replaces the random start index plus recursion,
        # which could recurse without bound and never reached its exit branch.
        random.shuffle(candidates)
        for key in candidates:
            url = URL_FOR_CHANGE_KEY % (key)
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                json_ = requests.get(url).json()
            except (requests.RequestException, ValueError) as exc:
                print('requests.get(url)', exc)
                continue
            if json_.get('infocode') == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    
    
    FNAME = '电影院任务列表_20170724.csv'
    # Task rows keyed by 1-based file line number (the header line is skipped).
    tosupply_dic = {}
    file_line_num = 0
    with open(FNAME, 'r', encoding='gbk') as fo:
        for i in fo:
            file_line_num += 1
            if file_line_num == 1:
                continue
            # A run of eight commas marks a row that takes area, province,
            # city and parent company from the previous row.
            is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
            l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
            dic_ = {}
            dic_['sequence_number'] = l[0]
            dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
            dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
                'province']
            dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
            dic_['district'] = ''
            dic_['address'] = ''
            dic_['buliding'] = ''
            dic_['longitude_latitude'] = ''
            dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
            dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
            dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
            dic_['is_from_past_line'] = is_from_past_line
            dic_['gd_name'] = ''
            dic_['gd_city'] = ''
            tosupply_dic[file_line_num] = dic_

    EACH_THREAD_REQUEST_NUM = 30
    # Line numbers whose lookup failed and must be retried.
    exception_line_num_list = []

    # keywords = '&keywords='
    # OFFSET = '&offset=1'
    # CITYLIMIT = '&citylimit=true'
    # # 080601	Sports & leisure services / theatres / cinema
    # POI_TYPES = '&types=080601'
    #
    # http://restapi.amap.com/v3/place/text?key=&keywords=重庆万盛DAV国际影城&city=重庆&types=080601&offset=1&citylimit=true
    requests_counter = 0
    tosupply_dic_len = len(tosupply_dic)
    tosupply_dic_len_ = tosupply_dic_len - 1
    
    
    # thread_strat_file_line_num
    def supply_dic(thread_strat_file_line_num):
        """Fill a block of task rows from AMap place-search responses.

        Walks ``EACH_THREAD_REQUEST_NUM`` line numbers beginning at
        ``thread_strat_file_line_num``.  Rows with a district already set are
        skipped.  Failed or rejected lookups are recorded in
        ``exception_line_num_list``; answered rows are removed from it.

        Args:
            thread_strat_file_line_num: first file line number to process.
        """
        global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list

        def _drop_from_retry(line_no):
            # Remove the line from the retry list when present.
            if line_no in exception_line_num_list:
                del exception_line_num_list[exception_line_num_list.index(line_no)]

        def _queue_for_retry(line_no):
            # Add the line to the retry list once.
            if line_no not in exception_line_num_list:
                exception_line_num_list.append(line_no)

        for delta in range(EACH_THREAD_REQUEST_NUM):
            file_line_num = thread_strat_file_line_num + delta
            if file_line_num - 2 > tosupply_dic_len_:
                # Beyond the last data row.
                return
            if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
                _drop_from_retry(file_line_num)
                continue

            task = tosupply_dic[file_line_num]
            url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (
                URL_TYPE, touse_key, task['name'], task['city'], POI_TYPES, OFFSET, CITYLIMIT)
            if requests_counter % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                r_json = requests.get(url).json()
            except Exception:
                _queue_for_retry(file_line_num)
                continue

            if r_json['infocode'] != '10000':
                # Rejected: retry later with a different key.
                _queue_for_retry(file_line_num)
                change_key()
                continue

            found = int(r_json['count'])
            if found > 0:
                first_poi = r_json['pois'][0]
                task['district'] = first_poi['adname']
                too_short = len(first_poi['address']) <= 2
                if too_short:
                    print(first_poi)
                    print(first_poi['address'])
                task['address'] = '高德缺地址' if too_short else first_poi['address']
                if too_short:
                    print(task['address'])
                task['longitude_latitude'] = first_poi['location']
                task['gd_name'] = first_poi['name']
                task['gd_city'] = first_poi['cityname']
            elif found == 0:
                task['gd_name'] = 'GD-NO-DATA'
            _drop_from_retry(file_line_num)
    
    
    # Retrying stops once at most this many failed lines are left.
    MAX_EXCEPTION_URL_NUM = 0


    def deal_exception_list():
        """Retry failed lines until the backlog is small enough.

        Prints ``exception_line_num_list`` before every pass.  While the list
        holds more than ``MAX_EXCEPTION_URL_NUM`` entries, ``supply_dic`` is
        called for each recorded line number.  The function returns once the
        list has shrunk to the threshold; it keeps retrying for as long as
        lines keep failing.
        """
        # A loop instead of tail recursion, so a long run of failing passes
        # cannot exhaust the interpreter's recursion limit.
        while True:
            print(exception_line_num_list)
            if len(exception_line_num_list) <= MAX_EXCEPTION_URL_NUM:
                return
            # supply_dic() removes entries from the list, so iterate a snapshot;
            # iterating the live list would skip elements.
            for thread_strat_file_line_num in list(exception_line_num_list):
                # An earlier call in this pass may already have resolved it.
                if thread_strat_file_line_num in exception_line_num_list:
                    supply_dic(thread_strat_file_line_num)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func`` with ``args`` as its only argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.func, self.args = func, args
            # Set after the base initialiser so the given name (even an empty
            # one) replaces the automatically generated thread name.
            self.name = name

        def run(self):
            # ``args`` is handed over as one value; it is not unpacked.
            target, payload = self.func, self.args
            target(payload)
    
    
    # 通灌北路58号苏宁广场7层701
    # 林源路创基尚城B区3层
    # 民治书香门第上河坊2栋2层
    # 东欣大道东欣广场城市综合体E01四楼
    # 横栏镇茂辉工业区乐丰四路21号永耀商业广场B幢之四(永耀人才市场旁)
    # 信江路西侧金峰城市广场1栋5层502(鹏泰购物广场5楼、迪欧咖啡楼上)
    # 上陡门学院中路人才大厦一层(东瓯影城)
    # 丰庆路710号(世纪联华超市4楼)
    # 风度中路13号百老汇商业城5层(美特斯邦威楼上)
    # 龙阳路2000号(龙阳广场5层)
    # 容城大道东12号(容城天骄写字楼3层)
    # 解放大道387号(汉口宗关水厂)南国西汇城市广场二期5层
    # 后沙峪镇安泰大街9号院(中粮祥云小镇)7号楼2层
    # 华强新天地3楼华时代美食城门口(横店电影院门口)
    # 南三环西路16号1号楼首地大峡谷购物中心5层
    # 永兴路7号院1号楼龙湖北京大兴天街购物中心L3层Z2
    def gen_building(str):
        """Cut the building part out of an address string.

        The result starts right after the first matching start marker, tried in
        this order: '号', '交汇处', '交叉口', '路', '道'.  It ends with the first
        '层', or with the first '楼' when there is no '层'.  Without a start
        marker the slice begins at the start of the address; without an end
        marker it runs to the end.  ASCII parentheses are removed from the
        result.

        :param str: the address text.
        :return: the extracted part, or '' when no marker is found at all.
        """
        start_ = 0
        end_ = len(str)
        found_marker = False

        for marker in ('号', '交汇处', '交叉口', '路', '道'):
            position = str.find(marker)
            if position > -1:
                # Continue right behind the marker, whatever its length.
                start_ = position + len(marker)
                found_marker = True
                break

        for marker in ('层', '楼'):
            position = str.find(marker)
            if position > -1:
                # Keep the marker character itself in the result.
                end_ = position + 1
                found_marker = True
                break

        if not found_marker:
            return ''
        # Strip the parentheses unconditionally: testing the truthiness of
        # find('(') skipped the case where the parenthesis is the first
        # character (index 0).
        return str[start_:end_].replace('(', '').replace(')', '')
    
    
    def main():
        """Run all lookups in worker threads, retry failures, write the CSV.

        One ``MyThread`` per batch of ``EACH_THREAD_REQUEST_NUM`` file lines
        runs ``supply_dic``.  After all threads have finished,
        ``deal_exception_list`` retries the failed lines and the content of
        ``tosupply_dic`` is written to the result file.
        """
        print('starting at:', ctime())
        # Data lines are numbered 2 .. tosupply_dic_len + 1 (line 1 is the
        # header), so the batches have to reach two lines past the row count.
        start_lines = list(range(0, tosupply_dic_len + 2, EACH_THREAD_REQUEST_NUM))
        thread_sum = len(start_lines)
        print(185, thread_sum)
        threads_list = []
        for thread_strat_file_line_num in start_lines:
            print(thread_strat_file_line_num)
            threads_list.append(
                MyThread(supply_dic, thread_strat_file_line_num, supply_dic.__name__))
        # The main thread exits only after all non-daemon threads have ended.
        for t in threads_list:
            # Set the attribute; assigning to setDaemon would only replace the
            # method and leave the flag untouched.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        deal_exception_list()
        for i in exception_line_num_list:
            print('EXCEPTION', i)

        FGEN = '电影院任务列表_20170724_新增列_已计算楼宇.csv'
        # One handle for header and rows.  The context manager closes the file;
        # reopening it in append mode while the header was still unflushed
        # could put a second BOM into the output.
        with open(FGEN, 'w', encoding='utf-8-sig') as fo:
            fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is_from_past_line,gd_name,gd_city\n')
            for file_line_num, dic_ in tosupply_dic.items():
                if file_line_num == 1:
                    continue
                # Commas inside values are replaced so they cannot split a column.
                row = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                    dic_['sequence_number'], dic_['area'], dic_['province'], dic_['city'], dic_['district'],
                    dic_['address'].replace(',', '  '),
                    gen_building(dic_['address']), dic_['longitude_latitude'].replace(',', ' '),
                    dic_['busniess_type'],
                    dic_['name'],
                    dic_['parent_company'],
                    dic_['is_from_past_line'], dic_['gd_name'], dic_['gd_city'])
                fo.write(row)


    if __name__ == '__main__':
        main()
    

      

    高德缺地址
    {'id': 'B0FFGJYCPN', 'name': '横店电影城(原阳店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '080601', 'biz_type': 'cinema', 'address': [], 'location': '113.960307,35.065736', 'tel': '0373-5911199', 'distance': [], 'biz_ext': [], 'pname': '河南省', 'cityname': '新乡市', 'adname': '原阳县', 'importance': [], 'shopid': [], 'shopinfo': '0', 'poiweight': []}
    []
    高德缺地址
    {'id': 'B0FFH0368R', 'name': '横店电影城', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '080601', 'biz_type': 'cinema', 'address': [], 'location': '118.430053,29.862110', 'tel': [], 'distance': [], 'biz_ext': [], 'pname': '安徽省', 'cityname': '黄山市', 'adname': '歙县', 'importance': [], 'shopid': [], 'shopinfo': '0', 'poiweight': []}
    []
    高德缺地址
    [300, 30, 2, 390, 330, 480, 570, 900, 840, 450, 1140, 630, 990, 180, 1050, 90, 240, 360, 720, 750, 690, 1170, 60, 1230, 960, 210, 1200, 930, 510, 150, 600, 870, 1080, 810, 660, 540, 270, 420, 1110, 120, 780, 1020, 1113, 813, 544, 69, 94, 119]
    [390, 330, 570, 840, 1140, 990, 1050, 240, 720, 690, 60, 960, 1200, 510, 600, 1080, 660, 270, 1110, 780, 1113]
    [330, 840, 990, 240, 690, 960, 510, 1080, 270, 780]
    [840, 240, 960, 1080, 780]
    [240, 1080]
    [1080]
    []
    # -*- coding: UTF-8 -*-
    import re
    import pprint
    import json
    import time
    import math
    import sys
    import requests
    import threading
    from time import ctime, sleep
    import random
    
    # fo = open('电影院任务列表_20170724_新增列 - 副本 (12).csv', 'r', encoding='utf-8')
    # file_line_num = 0
    # for i in fo:
    #     if file_line_num == 0:
    #         continue
    #     l = i.replace('\n','').split(',')
    #
    # ddd = 5
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    # Throttle settings: the request code sleeps QPS_TIME_UNIT seconds whenever
    # its request counter is a multiple of QPS.
    QPS = 50
    QPS_TIME_UNIT = 1
    # http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # Each usable line carries the key as the first whitespace-separated token
    # of its second tab-separated column.  The context manager closes the file.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            try:
                key = i.split('\t')[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as e:
                # Line without a second column or without a key: report it
                # (the actual error, not the Exception class) and skip it.
                print('skipped key pool line', repr(i), e)
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
    
    # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=1
    # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=2
    # offset=2 returns results that are closer to what is expected.
    # Endpoint of the place-text search.
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    # API key currently in use; replaced by change_key().
    touse_key = ''
    keywords = '&keywords='
    OFFSET = '&offset=2'
    ## In our own data city and name do not always match, so citylimit was
    ## switched from true to false.  Example row that found nothing before:
    # 华中区    湖北    钟祥                    电影院    横店荆门店    横店影视股份有限公司    0    GD-NO-DATA
    CITYLIMIT = '&citylimit=false'
    # 080601: sports & leisure services / theatres / cinemas
    POI_TYPES = '&types=080601'
    # Request used by change_key() to probe a key; %s is the key.
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
    
    # Number of probe requests sent by change_key(), used for its throttle.
    change_key_qps = 0
    
    
    def change_key():
        """Switch ``touse_key`` to another key from the pool that still works.

        The keys other than the current one are probed in random order with
        ``URL_FOR_CHANGE_KEY``; the first key whose response carries
        ``INFOCODE_OK`` becomes the new ``touse_key``.  Probes are throttled the
        same way as ordinary requests (``QPS`` / ``QPS_TIME_UNIT``).

        If a probe cannot be completed (network or decoding error) that key is
        skipped; when this leaves no confirmed key, ``touse_key`` stays
        unchanged and the caller may simply try again later.

        :raises SystemExit: with 'NOInvalidKEY' when every candidate key was
            rejected by the service, or when the pool is empty.
        """
        global touse_key, change_key_qps
        # The service does not strictly follow its own QPS / daily quota policy,
        # so its return codes cannot be used to plan key usage; keys are
        # therefore picked at random and verified with a real request.
        candidates = [key for key in KEY_POOL_LIST if key != touse_key]
        if not candidates:
            # Only the current key exists: re-check that one.
            candidates = list(KEY_POOL_LIST)
        random.shuffle(candidates)

        rejected = 0
        # Every candidate is tried at most once, so the search always ends;
        # no recursion and no index that can run past the pool.
        for key in candidates:
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(QPS_TIME_UNIT)
            try:
                json_ = requests.get(URL_FOR_CHANGE_KEY % key).json()
            except Exception as e:
                print('requests.get(url)', e)
                continue
            if json_.get('infocode') == INFOCODE_OK:
                touse_key = key
                return
            rejected += 1

        if rejected == len(candidates):
            sys.exit('NOInvalidKEY')
    
    
    FNAME = '电影院任务列表_20170724.csv'
    # {file line number: row dict}; line 1 (the header) is not stored.
    tosupply_dic = {}
    file_line_num = 0
    # The context manager closes the input file once all rows are read.
    with open(FNAME, 'r', encoding='gbk') as fo:
        for i in fo:
            file_line_num += 1
            if file_line_num == 1:
                continue
            # A run of eight commas marks a continuation row: it leaves area,
            # province, city and parent company blank and inherits them from
            # the row on the previous file line.
            is_from_past_line = 1 if ',,,,,,,,' in i else 0
            # Dropping seven of those commas moves business type and name of a
            # continuation row to columns 1 and 2.
            l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
            if is_from_past_line == 0:
                area, province, city = l[1], l[2], l[3]
                busniess_type, name, parent_company = l[8], l[9], l[10]
            else:
                previous = tosupply_dic[file_line_num - 1]
                area, province, city = previous['area'], previous['province'], previous['city']
                busniess_type, name = l[1], l[2]
                parent_company = previous['parent_company']
            # Keys (including their spelling) are used by the rest of the script.
            tosupply_dic[file_line_num] = {
                'sequence_number': l[0],
                'area': area,
                'province': province,
                'city': city,
                'district': '',
                'address': '',
                'buliding': '',
                'longitude_latitude': '',
                'busniess_type': busniess_type,
                'name': name,
                'parent_company': parent_company,
                'is_from_past_line': is_from_past_line,
                'gd_name': '',
                'gd_city': '',
            }
    
    # Number of consecutive file lines one supply_dic() call walks through.
    EACH_THREAD_REQUEST_NUM = 30
    # File line numbers whose lookup failed; supply_dic() appends to and removes
    # from this list, deal_exception_list() retries what is left in it.
    exception_line_num_list = []
    
    # Earlier request settings, kept for reference:
    # keywords = '&keywords='
    # OFFSET = '&offset=1'
    # CITYLIMIT = '&citylimit=true'
    # # 080601: sports & leisure services / theatres / cinemas
    # POI_TYPES = '&types=080601'
    #
    # Example request:
    # http://restapi.amap.com/v3/place/text?key=&keywords=重庆万盛DAV国际影城&city=重庆&types=080601&offset=1&citylimit=true
    # Counter read by supply_dic() for its "requests_counter % QPS" throttle check.
    requests_counter = 0
    # Number of data rows loaded into tosupply_dic.
    tosupply_dic_len = len(tosupply_dic)
    # Row count minus one; supply_dic() compares "file_line_num - 2" against it
    # to detect the end of the data.
    tosupply_dic_len_ = tosupply_dic_len - 1
    
    
    # thread_strat_file_line_num
    def supply_dic(thread_strat_file_line_num):
        """Look up one batch of file lines and store the results in tosupply_dic.

        Starting at ``thread_strat_file_line_num`` the function walks through
        ``EACH_THREAD_REQUEST_NUM`` consecutive file line numbers.  For every
        line that has no ``district`` yet it queries the place-text endpoint with
        the row's ``name`` and ``city`` and copies district, address, location,
        name and city of the first returned POI into the row.

        Lines whose request fails, or whose response carries an ``infocode``
        other than ``'10000'``, are recorded in ``exception_line_num_list`` so
        they can be retried; lines that were answered are removed from it.

        :param thread_strat_file_line_num: first file line number of the batch.
        """
        global requests_counter

        def mark_failed(line_num):
            # Queue the line for a retry, but only once.
            if line_num not in exception_line_num_list:
                exception_line_num_list.append(line_num)

        def mark_done(line_num):
            # remove() instead of "in" + index() + del: a single operation that
            # cannot trip over the entry disappearing in between.
            try:
                exception_line_num_list.remove(line_num)
            except ValueError:
                pass

        for loop in range(EACH_THREAD_REQUEST_NUM):
            file_line_num = thread_strat_file_line_num + loop
            # Past the last data line: nothing left for this batch.
            if file_line_num - 2 > tosupply_dic_len_:
                return
            # Lines 0 and 1 hold no data (line 1 is the CSV header).
            if file_line_num < 2:
                mark_done(file_line_num)
                continue
            dic_ = tosupply_dic[file_line_num]
            if len(dic_['district']) > 0:
                # Already filled in by an earlier pass.
                mark_done(file_line_num)
                continue

            url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (
                URL_TYPE, touse_key, dic_['name'], dic_['city'], POI_TYPES, OFFSET, CITYLIMIT)
            if requests_counter % QPS == 0:
                sleep(QPS_TIME_UNIT)
            # The counter has to advance, otherwise the modulo check above is
            # true for every request and the throttle sleeps each time.
            requests_counter += 1
            try:
                r_json = requests.get(url).json()
            except Exception:
                # Network or decoding problem: remember the line and move on.
                mark_failed(file_line_num)
                continue

            if r_json.get('infocode') != '10000':
                mark_failed(file_line_num)
                change_key()
                continue

            count = int(r_json['count'])
            if count > 0:
                pos_dic = r_json['pois'][0]
                address = pos_dic['address']
                # The API may return an empty list instead of a string here.
                has_address = len(address) > 2
                dic_['district'] = pos_dic['adname']
                if not has_address:
                    print(pos_dic)
                    print(address)
                dic_['address'] = address if has_address else '高德缺地址'
                if not has_address:
                    print(dic_['address'])
                dic_['longitude_latitude'] = pos_dic['location']
                dic_['gd_name'] = pos_dic['name']
                dic_['gd_city'] = pos_dic['cityname']
            elif count == 0:
                dic_['gd_name'] = 'GD-NO-DATA'
            mark_done(file_line_num)
    
    
    # Retrying stops once at most this many failed lines are left.
    MAX_EXCEPTION_URL_NUM = 0


    def deal_exception_list():
        """Retry failed lines until the backlog is small enough.

        Prints ``exception_line_num_list`` before every pass.  While the list
        holds more than ``MAX_EXCEPTION_URL_NUM`` entries, ``supply_dic`` is
        called for each recorded line number.  The function returns once the
        list has shrunk to the threshold; it keeps retrying for as long as
        lines keep failing.
        """
        # A loop instead of tail recursion, so a long run of failing passes
        # cannot exhaust the interpreter's recursion limit.
        while True:
            print(exception_line_num_list)
            if len(exception_line_num_list) <= MAX_EXCEPTION_URL_NUM:
                return
            # supply_dic() removes entries from the list, so iterate a snapshot;
            # iterating the live list would skip elements.
            for thread_strat_file_line_num in list(exception_line_num_list):
                # An earlier call in this pass may already have resolved it.
                if thread_strat_file_line_num in exception_line_num_list:
                    supply_dic(thread_strat_file_line_num)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func`` with ``args`` as its only argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.func, self.args = func, args
            # Set after the base initialiser so the given name (even an empty
            # one) replaces the automatically generated thread name.
            self.name = name

        def run(self):
            # ``args`` is handed over as one value; it is not unpacked.
            target, payload = self.func, self.args
            target(payload)
    
    
    def gen_building(str):
        """Cut the building part out of an address string.

        The result starts right after the first '号' (or, failing that, the
        first '路', then '道') and ends with the first '层' (or, failing that,
        '楼').  Only results longer than three characters are returned.

        NOTE(review): the search strings in this copy were all empty, which
        made every ``find`` return 0 and the function always return ''.  The
        markers were restored from the other gen_building definition in this
        file -- confirm they are the intended ones.

        :param str: the address text.
        :return: the extracted part, or '' when it is missing or too short.
        """
        start_ = 0
        end_ = 0
        for marker in ('号', '路', '道'):
            position = str.find(marker)
            if position > -1:
                start_ = position + 1
                break

        for marker in ('层', '楼'):
            position = str.find(marker)
            if position > -1:
                # Keep the marker character itself in the result.
                end_ = position + 1
                break

        if end_ - start_ > 3:
            return str[start_:end_]
        return ''
    
    
    def main():
        """Run all lookups in worker threads, retry failures, write the CSV.

        One ``MyThread`` per batch of ``EACH_THREAD_REQUEST_NUM`` file lines
        runs ``supply_dic``.  After all threads have finished,
        ``deal_exception_list`` retries the failed lines and the content of
        ``tosupply_dic`` is written to the result file.
        """
        print('starting at:', ctime())
        # Data lines are numbered 2 .. tosupply_dic_len + 1 (line 1 is the
        # header), so the batches have to reach two lines past the row count.
        start_lines = list(range(0, tosupply_dic_len + 2, EACH_THREAD_REQUEST_NUM))
        thread_sum = len(start_lines)
        print(185, thread_sum)
        threads_list = []
        for thread_strat_file_line_num in start_lines:
            print(thread_strat_file_line_num)
            threads_list.append(
                MyThread(supply_dic, thread_strat_file_line_num, supply_dic.__name__))
        # The main thread exits only after all non-daemon threads have ended.
        for t in threads_list:
            # Set the attribute; assigning to setDaemon would only replace the
            # method and leave the flag untouched.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        deal_exception_list()
        for i in exception_line_num_list:
            print('EXCEPTION', i)

        FGEN = '电影院任务列表_20170724_新增列_已计算楼宇.csv'
        # One handle for header and rows.  The context manager closes the file;
        # reopening it in append mode while the header was still unflushed
        # could put a second BOM into the output.
        with open(FGEN, 'w', encoding='utf-8-sig') as fo:
            fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is_from_past_line,gd_name,gd_city\n')
            for file_line_num, dic_ in tosupply_dic.items():
                if file_line_num == 1:
                    continue
                # Commas inside values are replaced so they cannot split a column.
                row = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                    dic_['sequence_number'], dic_['area'], dic_['province'], dic_['city'], dic_['district'],
                    dic_['address'].replace(',', '  '),
                    gen_building(dic_['address']), dic_['longitude_latitude'].replace(',', ' '),
                    dic_['busniess_type'],
                    dic_['name'],
                    dic_['parent_company'],
                    dic_['is_from_past_line'], dic_['gd_name'], dic_['gd_city'])
                fo.write(row)


    if __name__ == '__main__':
        main()
    {'id': 'B02DD0R6M6', 'name': '横店电影城(大汉店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '080601', 'biz_type': 'cinema', 'address': [], 'location': '113.149267,27.838133', 'tel': '0731-22915555', 'distance': [], 'biz_ext': [], 'pname': '湖南省', 'cityname': '株洲市', 'adname': '芦淞区', 'importance': [], 'shopid': [], 'shopinfo': '0', 'poiweight': []}
    []
    高德缺地址
    {'id': 'B0FFGJYCPN', 'name': '横店电影城(原阳店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '080601', 'biz_type': 'cinema', 'address': [], 'location': '113.960307,35.065736', 'tel': '0373-5911199', 'distance': [], 'biz_ext': [], 'pname': '河南省', 'cityname': '新乡市', 'adname': '原阳县', 'importance': [], 'shopid': [], 'shopinfo': '0', 'poiweight': []}
    []
    高德缺地址
    {'id': 'B0FFH0368R', 'name': '横店电影城', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '080601', 'biz_type': 'cinema', 'address': [], 'location': '118.430053,29.862110', 'tel': [], 'distance': [], 'biz_ext': [], 'pname': '安徽省', 'cityname': '黄山市', 'adname': '歙县', 'importance': [], 'shopid': [], 'shopinfo': '0', 'poiweight': []}
    []
    高德缺地址
    [300, 30, 2, 390, 330, 480, 570, 900, 840, 450, 1140, 630, 990, 180, 1050, 90, 240, 360, 720, 750, 690, 1170, 60, 1230, 960, 210, 1200, 930, 510, 150, 600, 870, 1080, 810, 660, 540, 270, 420, 1110, 120, 780, 1020, 1113, 813, 544, 69, 94, 119]
    [390, 330, 570, 840, 1140, 990, 1050, 240, 720, 690, 60, 960, 1200, 510, 600, 1080, 660, 270, 1110, 780, 1113]
        if url in exception_url_list:
            l_index = exception_url_list.index(url)
            print(139, 'del')
            del exception_url_list[l_index]
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func`` with ``args`` as its only argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.func, self.args = func, args
            # Set after the base initialiser so the given name (even an empty
            # one) replaces the automatically generated thread name.
            self.name = name

        def run(self):
            # ``args`` is handed over as one value; it is not unpacked.
            target, payload = self.func, self.args
            target(payload)
    
    
    # Retrying stops once at most this many failed URLs are left.
    MAX_EXCEPTION_URL_NUM = 60


    def deal_exception_url_list():
        """Fetch failed URLs again until the backlog is small enough.

        While ``exception_url_list`` holds more than ``MAX_EXCEPTION_URL_NUM``
        entries, ``grab_one_url`` is called for each recorded URL.  The function
        returns once the list has shrunk to the threshold; it keeps retrying for
        as long as more URLs than that keep failing.
        """
        # A loop instead of tail recursion, so a long run of failing passes
        # cannot exhaust the interpreter's recursion limit.
        while len(exception_url_list) > MAX_EXCEPTION_URL_NUM:
            # grab_one_url() removes URLs from the list on success, so iterate
            # a snapshot; iterating the live list would skip elements.
            for url in list(exception_url_list):
                grab_one_url(url)
    # -*- coding: UTF-8 -*-
    
    import math
    import random
    import re
    import sys
    import threading
    from time import ctime, sleep
    from lxml import etree
    import pprint
    import requests
    
    BASE_URL = 'https://www.dianping.com/'
    url_district_list = []
    GLUE = 'search'
    # Separator between the URL paths stored on the first line of the pool file.
    SCALA = 'SCALA'
    # Only the first line is used.  The context manager closes the file (the
    # bare "f.closed" expression never did).
    with open('spider_深圳_district_bussi-nav_url_list - 副本.txt', 'r', encoding='utf-8') as f:
        first_line = f.readline()
    # Strip the line ending so it cannot end up inside the last URL; an empty
    # file gives an empty pool instead of an undefined URL_POOL.
    first_line = first_line.rstrip('\r\n')
    URL_POOL = first_line.split(SCALA) if first_line else []

    URL_POOL_LEN = len(URL_POOL)
    URL_NUM_EACH_THREAD = 1
    # {url: {...page data...}}
    res_dic = {}

    # Page numbers requested per pool entry are 1 .. MAX_PAGE_NUM - 1.
    MAX_PAGE_NUM = 50
    # Throttle: grab_one_url() sleeps QPS_TIME_UNIT seconds whenever
    # request_counter is a multiple of QPS.
    QPS = 30
    request_counter = 0
    QPS_TIME_UNIT = 1
    
    # url = '%s%s%sp%s' % (BASE_URL, GLUE, URL_POOL[1], 3)
    # print(url)
    # r = requests.get(url)
    # html = r.text
    # selector = etree.HTML(html)
    # page_title = selector.xpath('//title/text()')
    # page_Keywords = selector.xpath('//meta[@name="Keywords"]')[0].attrib['content']
    # data_ga_index_1 = selector.xpath('.//a[@data-ga-index="1"]/span/text()')[0]
    # data_ga_index_2 = selector.xpath('.//a[@data-ga-index="2"]/span/text()')[0]
    # data_ga_index_3 = selector.xpath('.//a[@data-ga-index="3"]/span/text()')[0]
    # res_dic[url] = {}
    # res_dic[url]['page_title'] = page_title
    # res_dic[url]['page_Keywords'] = page_Keywords
    # res_dic[url]['data_ga_index_1'] = data_ga_index_1
    # res_dic[url]['data_ga_index_2'] = data_ga_index_2
    # res_dic[url]['data_ga_index_3'] = data_ga_index_3
    #
    # name_l = selector.xpath('.//li[@class=""]//h4/text()')
    #
    # mean_price_l = selector.xpath('.//li[@class=""]//a[@class="mean-price"]/b/text()')
    #
    # flavour_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[1]/span/text()')
    #
    # position_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[2]/span/text()')
    #
    # address_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/span/text()')
    #
    # len_l = len(name_l)
    # res_dic[url]['items_num'] = len_l
    # res_dic[url]['items_list'] = []
    # len_l_ = len_l - 1
    # for i in range(0, len_l_, 1):
    #     d = {}
    #     d['name'] = name_l[i]
    #     d['mean_price'] = mean_price_l[i]
    #     d['flavour'] = flavour_l[i]
    #     d['position'] = position_l[i]
    #     d['address'] = address_l[i]
    #     res_dic[url]['items_list'].append(d)
    #
    # d = 4
    
    # URLs that could not be fetched or parsed and still have to be retried.
    exception_url_list = []


    def grab_one_url(url):
        """Download one listing page and store its content in ``res_dic[url]``.

        Stored per page: title, keywords, the three ``data-ga-index`` labels,
        the number of list entries and one dict per entry (name, mean price,
        flavour, position, address).  Commas are removed from name, address and
        keywords.

        A URL whose request fails or whose page lacks the expected header
        elements is recorded in ``exception_url_list``; a URL that was handled
        successfully is removed from it.

        :param url: the page to fetch.
        """
        global request_counter

        def mark_failed():
            # Remember the URL for a retry, once.
            if url not in exception_url_list:
                exception_url_list.append(url)
                print(exception_url_list)

        def pick(values, index, default=''):
            # The per-field lists can be shorter than the name list.
            return values[index] if index < len(values) else default

        if request_counter % QPS == 0:
            print(36, 'sleep', request_counter)
            sleep(QPS_TIME_UNIT)
        request_counter += 1

        try:
            r = requests.get(url)
        except Exception:
            mark_failed()
            return

        # All header lookups index into xpath results; keep them inside one
        # guarded section so a page without them is queued for a retry instead
        # of ending the thread with an unrecorded error.
        try:
            selector = etree.HTML(r.text)
            page = {
                'page_title': selector.xpath('//title/text()')[0],
                'page_Keywords': selector.xpath('//meta[@name="Keywords"]')[0].attrib['content'].replace(',', ''),
                'data_ga_index_1': selector.xpath('.//a[@data-ga-index="1"]/span/text()')[0],
                'data_ga_index_2': selector.xpath('.//a[@data-ga-index="2"]/span/text()')[0],
                'data_ga_index_3': selector.xpath('.//a[@data-ga-index="3"]/span/text()')[0],
            }
        except Exception:
            mark_failed()
            return

        name_l = selector.xpath('.//li[@class=""]//h4/text()')
        mean_price_l = selector.xpath('.//li[@class=""]//a[@class="mean-price"]/b/text()')
        flavour_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[1]/span/text()')
        position_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[2]/span/text()')
        address_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/span/text()')

        # Every name gets an entry; stopping at len - 1 dropped the last one.
        items = []
        for i, name in enumerate(name_l):
            items.append({
                'name': name.replace(',', ''),
                'mean_price': pick(mean_price_l, i, '00'),
                'flavour': pick(flavour_l, i),
                'position': pick(position_l, i),
                # https://www.dianping.com/search/category/7/10/r12335p1
                'address': pick(address_l, i).replace(',', ''),
            })
        page['items_num'] = len(name_l)
        page['items_list'] = items
        # Publish the page only when it is complete.
        res_dic[url] = page

        if url in exception_url_list:
            print(139, 'del')
            exception_url_list.remove(url)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func`` with ``args`` as its only argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.func, self.args = func, args
            # Set after the base initialiser so the given name (even an empty
            # one) replaces the automatically generated thread name.
            self.name = name

        def run(self):
            # ``args`` is handed over as one value; it is not unpacked.
            target, payload = self.func, self.args
            target(payload)
    
    
    # Retrying stops once at most this many failed URLs are left.
    MAX_EXCEPTION_URL_NUM = 60


    def deal_exception_url_list():
        """Fetch failed URLs again until the backlog is small enough.

        While ``exception_url_list`` holds more than ``MAX_EXCEPTION_URL_NUM``
        entries, ``grab_one_url`` is called for each recorded URL.  The function
        returns once the list has shrunk to the threshold; it keeps retrying for
        as long as more URLs than that keep failing.
        """
        # A loop instead of tail recursion, so a long run of failing passes
        # cannot exhaust the interpreter's recursion limit.
        while len(exception_url_list) > MAX_EXCEPTION_URL_NUM:
            # grab_one_url() removes URLs from the list on success, so iterate
            # a snapshot; iterating the live list would skip elements.
            for url in list(exception_url_list):
                grab_one_url(url)
    
    
    def main():
        """Crawl every pool URL page by page and write the result files.

        One ``MyThread`` per page URL (pages 1 .. ``MAX_PAGE_NUM`` - 1 of every
        ``URL_POOL`` entry) runs ``grab_one_url``.  After all threads have
        finished, ``deal_exception_url_list`` retries failed URLs.  The collected
        restaurants go to one CSV file, the URLs that still failed to another.
        """
        print('starting at:', ctime())
        threads_list = []
        for nloop in range(URL_POOL_LEN):
            for nnloop in range(1, MAX_PAGE_NUM):
                url = '%s%s%sp%s' % (BASE_URL, GLUE, URL_POOL[nloop], nnloop)
                print(62, url)
                threads_list.append(MyThread(grab_one_url, url, grab_one_url.__name__))
        # The main thread exits only after all non-daemon threads have ended.
        for t in threads_list:
            print(70, t)
            # Set the attribute; assigning to setDaemon would only replace the
            # method and leave the flag untouched.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        deal_exception_url_list()

        # One handle per output file, closed by the context manager; reopening
        # in append mode while the first handle was still unflushed could put a
        # second BOM into the output.
        f_name = 'dzdp_基于区-大商圈的餐馆列表-深圳.csv'
        with open(f_name, 'w', encoding='utf-8-sig') as f:
            f.write('name,mean_price, flavour, position,address,url,page_title, page_Keywords, data_ga_index_1, data_ga_index_2, data_ga_index_3,\n')
            for url, page in res_dic.items():
                for d in page['items_list']:
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        d['name'], d['mean_price'], d['flavour'], d['position'], d['address'], url,
                        page['page_title'], page['page_Keywords'], page['data_ga_index_1'],
                        page['data_ga_index_2'], page['data_ga_index_3'])
                    f.write(row)

        f_name = 'dzdp_基于区-大商圈的餐馆列表-深圳_EXCEPTION_URL.csv'
        with open(f_name, 'w', encoding='utf-8-sig') as f:
            for url in exception_url_list:
                f.write(url + '\n')


    if __name__ == '__main__':
        main()
    #
    #
    # d = 3
    # url = ('%s%s%sp%s') % (BASE_URL, GLUE, '/category/7/10/r1949', 100)
    # print(url)
    #
    #
    
    #
    #
    # class MyThread(threading.Thread):
    #     def __init__(self, func, args, name=''):
    #         threading.Thread.__init__(self)
    #         self.name = name
    #         self.func = func
    #         self.args = args
    #
    #     def run(self):
    #         self.func(self.args)
    #
    #
    # def main():
    #     print('starting at:', ctime())
    #     threads_list = []
    #     thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
    #     print(thread_sum)
    #     for nloop in range(1, thread_sum, 1):
    #         print(nloop)
    #         thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
    #
    #         threads_list.append(thread_instance)
    #     # 主进程将在所有非守护进程退出后,退出
    #     for t in threads_list:
    #         print(t)
    #         t.setDaemon = False
    #         t.start()
    #     # wait for all thrades to finish
    #     for t in threads_list:
    #         t.join()
    #     f_name = 'ALL.csv'
    #     f = open(f_name, 'w', encoding='utf-8-sig')
    #     f.write('')
    #     f.closed
    #     f = open(f_name, 'a', encoding='utf-8-sig')
    #     str = 'city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n'
    #     f.write(str)
    #     ## city,district,address,name,catering_kind,average_price,data_from
    #     count_write_rows = 0
    #     for i in cater_dic:
    #         city = i
    #         if city == '城市':
    #             continue
    #         for ii in cater_dic[i]:
    #             district = ii
    #             for iii in cater_dic[i][ii]:
    #                 name = iii
    #                 for iv in cater_dic[i][ii][iii]:
    #                     address = iv
    #                     catering_kind = cater_dic[i][ii][iii][iv]['catering_kind']
    #                     average_price = cater_dic[i][ii][iii][iv]['average_price']
    #                     if_in_business_area = cater_dic[i][ii][iii][iv]['if_in_business_area']
    #                     if_in_business_area_criterion = cater_dic[i][ii][iii][iv]['if_in_business_area_criterion']
    #                     data_from = cater_dic[i][ii][iii][iv]['data_from']
    #                     str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
    #                         city, district, name, address, if_in_business_area, if_in_business_area_criterion,
    #                         catering_kind, average_price, data_from)
    #                     f.write(str)
    #                     count_write_rows += 1
    #     f.closed
    #     print(count_write_rows)
    #
    #
    # if __name__ == '__main__':
    #     main()
    #
    #
    #
    #
    #
    #
    #
    #
    # d = 4
    # URL_POOL = f.detach()
    #
    # # {district:bussi-nav:[page]}
    # SPIDER_URL_DISTRICT_DIC = {}
    # for i in f:
    #     d = i.replace('\n', '').replace(' ', '')
    #     if d.find(GLUE) > -1:
    #         dd = d.split(GLUE)[1].split('"')[0]
    #         SPIDER_URL_DISTRICT_DIC[dd] = []
    # QPS = 50
    # TIME_UNIT = 1
    # qps_counter = 0
    # for k in SPIDER_URL_DISTRICT_DIC:
    #     url = '%s%s%s' % (BASE_URL, GLUE, k)
    #     print(url)
    #     qps_counter += 1
    #     if qps_counter % QPS == 0:
    #         sleep(TIME_UNIT)
    #     try:
    #         r = requests.get(url)
    #         print(r.status_code)
    #         txt = r.text.replace('\r', '').replace(' ', '').split('\n')
    #         start_flag = 0
    #         for d in txt:
    #             if d.find('id="bussi-nav') > -1:
    #                 start_flag = 1
    #             else:
    #                 if start_flag == 1:
    #                     if d.find('/div') > -1:
    #                         start_flag = 0
    #                     else:
    #                         dd = d.split(GLUE)[1].split('"')[0]
    #                         SPIDER_URL_DISTRICT_DIC[k].append(dd)
    #                         print(dd)
    #     except Exception:
    #         # 修改为,2个函数,递归请求
    #         print('EXCEPTION', url)
    #         print(Exception)
    #
    # SCALA = 'SCALA'
    # str = ''
    # for k in SPIDER_URL_DISTRICT_DIC:
    #     for url in SPIDER_URL_DISTRICT_DIC[k]:
    #         str += SCALA + url
    # print(str)
    # str = str[5:]
    # print(str)
    # f = open('spider_深圳_district_bussi-nav_url_list.txt', 'w', encoding='utf-8')
    # f.write(str)
    # f.closed
    #
    # d = 5
    #
    # d = 3
    #
    # url = 'https://www.dianping.com/search/category/7/10/r29'
    # r = requests.get(url)
    #
    # d = 5
    # ZHITONGZI_CITY_DIC = {}
    # f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
    # ZHITONGZI_CITY_DIC['东莞市'] = []
    # ZHITONGZI_CITY_DIC['中山市'] = []
    # c = 0
    # for i in f:
    #     ii = i.split(';')
    #     for iii in ii:
    #         iv = iii.split('、')
    #         if len(iv) > 2:
    #             c += 1
    #             for v in iv:
    #                 if v.find('(') > -1:
    #                     v_ = v.split('(')[1]
    #                 elif v.find(')') > -1:
    #                     v_ = v.split(')')[0]
    #                 else:
    #                     v_ = v
    #                 if c == 1 or c == 2:
    #                     ZHITONGZI_CITY_DIC['东莞市'].append(v_)
    #                 elif c == 3 or c == 4:
    #                     ZHITONGZI_CITY_DIC['中山市'].append(v_)
    # f.closed
    #
    #
    # def chk_is_coffee(str):
    #     l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
    #     # 上岛花式铁板烧    日本菜
    #     # 泛太平洋大酒店面馆    其他
    #     l_b = ['咖啡', '星巴克']
    #     # 星巴克
    #     for i in l_:
    #         if str.upper().find(i.upper()) != -1:
    #             return True
    #     for i in l_b:
    #         if str.find(i) != -1:
    #             return True
    #     return False
    #
    #
    # def chk_kfc_mdl(str):
    #     if str.find(u"麦当劳") != -1:
    #         return 1
    #     elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
    #         return 0
    #     else:
    #         return 2
    #
    #
    # def get_name(str):
    #     if str.find("麦当劳") != -1:
    #         return '麦当劳'
    #     elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
    #         return '肯德基'
    #     else:
    #         # str = '狗不理包子(前门店)'
    #         # str =  '(清真)三羊水饺(新民路店)'
    #         # | 添椒 | 潮涮三国IP火锅
    #         if str.find('(') == -1 and str.find('(') == -1:
    #             return str
    #         res = str.strip(' ').split('(')[0].strip(' ')
    #         if len(res) == 0:
    #             try:
    #                 res = str.split(')')[1].split('(')[0]
    #             except Exception:
    #                 print(Exception)
    #         # 一锅两头牛(烟青路店)
    #         res_b = res
    #         try:
    #             res_b = res.split('(')[0]
    #         except Exception:
    #             print(Exception)
    #
    #         return res_b
    #
    #
    # def chk_city_district(str):
    #     city_district = str.replace(' ', '')
    #     if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
    #         return False
    #     elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
    #         return False
    #     else:
    #         return city_district
    #
    #
    # def chk_catering_kind(str):
    #     catering_kind = str.replace(' ', '')
    #     if re.match(r".*[0-9]", catering_kind) is not None:
    #         return False
    #     else:
    #         return catering_kind
    #
    #
    # # ['a','',' ']
    # def chk_list_thickness(list_):
    #     if len(list_) == 0:
    #         return False
    #     res_list = []
    #     for i in list_:
    #         i_b = i.replace(' ', '')
    #         if i.replace(' ', '') == '':
    #             return False
    #         else:
    #             res_list.append(i_b)
    #     return res_list
    #
    #
    # business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']
    #
    #
    # def chk_in_business_area(str):
    #     global business_area_tag_list
    #     for i in business_area_tag_list:
    #         if str.find(i) > -1:
    #             return 1
    #     return 0
    #
    #
    # # MAX_OFFSET = 25
    # # OFFSET = MAX_OFFSET - 1
    # MAX_PAGINATION = 100
    # pagination = MAX_PAGINATION
    # QPS = 50
    # TIME_UNIT = 1
    # # http://lbs.amap.com/api/webservice/guide/tools/info
    # INFOCODE_OK = '10000'
    # file_name_key_pool = 'key_pool.pool'
    # KEY_POOL_LIST = []
    # touse_key = ''
    # f = open(file_name_key_pool, 'r', encoding='utf-8')
    # for i in f:
    #     try:
    #         list_ = i.split('	')
    #         key = i.split('	')[1].split()
    #         KEY_POOL_LIST.append(key[0])
    #     except Exception:
    #         print(Exception)
    # KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
    #
    # # 北京市    西城区    金堂羊蝎子火锅    真武庙四条1号
    # # http://restapi.amap.com/v3/place/around?parameters
    # URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    # # URL_TYPE = 'http://restapi.amap.com/v3/around'
    # touse_key = ''
    # RADIUS = '&radius=20'
    # keywords = '&keywords='
    # OFFSET = '&offset=10'
    # CITYLIMIT = '&citylimit=true'
    #
    # URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
    #
    # change_key_qps = 0
    #
    #
    # def change_key():
    #     global touse_key, change_key_qps
    #
    #     # 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
    #     mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
    #     for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
    #         key = KEY_POOL_LIST[i]
    #         if key == touse_key:
    #             if i == KEY_POOL_NUM_INDICATOR:
    #                 change_key()
    #                 return
    #             else:
    #                 continue
    #         touse_key = key
    #         url = URL_FOR_CHANGE_KEY % (touse_key)
    #         try:
    #             change_key_qps += 1
    #             if change_key_qps % QPS == 0:
    #                 sleep(TIME_UNIT)
    #             r = requests.get(url)
    #             json_ = r.json()
    #         except Exception:
    #             print('requests.get(url)', Exception)
    #             change_key()
    #             return
    #         infocode = json_['infocode']
    #         if not infocode == INFOCODE_OK:
    #             if i == KEY_POOL_NUM_INDICATOR:
    #                 sys.exit('NOInvalidKEY')
    #             change_key()
    #             return
    #         return
    #
    #
    #
    #         # 060101    购物服务    商场    购物中心
    #
    #
    # FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']
    #
    #
    # def fliter_gd_business_area_type(url):
    #     global FILTER_GD_BUSINESS_AREA_TYPE_LIST
    #     # {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
    #     try:
    #         r = requests.get(url)
    #         r_json = r.json()
    #     except Exception:
    #         print(203, Exception)
    #         # 返回数据解析json异常
    #         return 3
    #     infocode = r_json['infocode']
    #     if infocode == '10000':
    #         count = r_json['count']
    #         if int(count) > 0:
    #             pois_list = r_json['pois']
    #             for l in pois_list:
    #                 type = l['type']
    #                 for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
    #                     if type.find(chk_type) > -1:
    #                         return 1
    #     else:
    #         change_key()
    #     return 0
    #
    #
    # # f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
    # # f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
    # # f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
    #
    # # 市key-区key-品牌名key-地址key-{菜类,均价,data_from}
    # # {city:{district:{name:{address:{catering_kind,average_price,data_from}}}}}
    # cater_dic = {}
    # # [{city,district,address,name,catering_kind,average_price,data_from}]
    # cater_exception_list = []
    # count_catering = 0
    # count_catering_exception = 0
    #
    # coffee_list = []
    # count_coffee = 0
    #
    # fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
    #
    # file_line_list = []
    # for i in fo:
    #     file_line_list.append(i)
    # fo.closed
    # #
    # # fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
    # # while fo.readline():
    # #     file_line_list_b.append(fo.readline())
    # file_line_list_len = len(file_line_list)
    # file_jump_step_num = 2000
    # count_catering_exception = 0
    # count_coffee = 0
    # count_catering = 0
    #
    #
    # def get_exception_logic_split_loop(nloop):
    #     global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
    #     start_line = nloop * file_jump_step_num
    #     if start_line >= file_line_list_len:
    #         print('last-line')
    #         return
    #     else:
    #         start_line_count = 0
    #         end_line = start_line + file_jump_step_num
    #         if end_line >= file_line_list_len:
    #             end_line = file_line_list_len - 1
    #         for i in range(start_line, end_line, 1):
    #             l_ = file_line_list[i].replace('\n', '').split(',')
    #             city = l_[0]
    #             district = l_[1]
    #             address = l_[2]
    #             name = l_[3]
    #             average_price = l_[4]
    #             catering_kind = l_[5]
    #             data_from = 'mtdz_5'
    #             # 数据准备层
    #             # 数据运算层
    #             # 该层处理从目标文件取出的字段列表
    #             focus_list = [city, district, address, name, catering_kind, average_price, data_from]
    #             dic_exception = {}
    #             dic_exception['data_from'] = data_from
    #             dic_exception['city'] = city
    #             dic_exception['district'] = district
    #             dic_exception['name'] = name
    #             dic_exception['address'] = address
    #             dic_exception['catering_kind'] = catering_kind
    #             dic_exception['average_price'] = average_price
    #
    #             if not chk_list_thickness(focus_list) or not chk_city_district(city) or not chk_city_district(
    #                     district) or not chk_catering_kind(catering_kind):
    #                 count_catering_exception += 1
    #                 cater_exception_list.append(dic_exception)
    #             else:
    #                 name = get_name(name)
    #
    #                 m = chk_is_coffee(name)
    #                 # if m:
    #                 #     print(list_)
    #                 if not m:
    #                     m = chk_is_coffee(catering_kind)
    #                 if m:
    #                     count_coffee += 1
    #                     coffee_list.append(dic_exception)
    #
    #                 if not m:
    #                     dic_details = {}
    #                     dic_details['data_from'] = data_from
    #                     dic_details['catering_kind'] = catering_kind
    #                     dic_details['average_price'] = average_price
    #                     if_in_business_area = chk_in_business_area(address)
    #                     if_in_business_area_criterion = 'str_match'
    #                     if if_in_business_area == 0:
    #                         city_r = '&city=' + district
    #                         keywords = '&keywords=' + address + '|' + name
    #                         start_line_count += 1
    #                         print(start_line, start_line_count)
    #                         if start_line_count % QPS == 0:
    #                             print('sleep')
    #                             sleep(1)
    #                         url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
    #
    #                         if_in_business_area = fliter_gd_business_area_type(url)
    #                         if_in_business_area_criterion = 'str_match+request_api'
    #
    #                     dic_details['if_in_business_area_criterion'] = if_in_business_area_criterion
    #                     dic_details['if_in_business_area'] = if_in_business_area
    #
    #                     if city not in cater_dic:
    #                         cater_dic[city] = {}
    #                     if district not in cater_dic[city]:
    #                         cater_dic[city][district] = {}
    #                     if name not in cater_dic[city][district]:
    #                         cater_dic[city][district][name] = {}
    #                     if address not in cater_dic[city][district][name]:
    #                         cater_dic[city][district][name][address] = {}
    #
    #                     cater_dic[city][district][name][address] = dic_details
    #                     count_catering += 1
    #
    #
    # class MyThread(threading.Thread):
    #     def __init__(self, func, args, name=''):
    #         threading.Thread.__init__(self)
    #         self.name = name
    #         self.func = func
    #         self.args = args
    #
    #     def run(self):
    #         self.func(self.args)
    #
    #
    # def main():
    #     print('starting at:', ctime())
    #     threads_list = []
    #     thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
    #     print(thread_sum)
    #     for nloop in range(1, thread_sum, 1):
    #         print(nloop)
    #         thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
    #
    #         threads_list.append(thread_instance)
    #     # 主进程将在所有非守护进程退出后,退出
    #     for t in threads_list:
    #         print(t)
    #         t.setDaemon = False
    #         t.start()
    #     # wait for all thrades to finish
    #     for t in threads_list:
    #         t.join()
    #     f_name = 'ALL.csv'
    #     f = open(f_name, 'w', encoding='utf-8-sig')
    #     f.write('')
    #     f.closed
    #     f = open(f_name, 'a', encoding='utf-8-sig')
    #     str = 'city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n'
    #     f.write(str)
    #     ## city,district,address,name,catering_kind,average_price,data_from
    #     count_write_rows = 0
    #     for i in cater_dic:
    #         city = i
    #         if city == '城市':
    #             continue
    #         for ii in cater_dic[i]:
    #             district = ii
    #             for iii in cater_dic[i][ii]:
    #                 name = iii
    #                 for iv in cater_dic[i][ii][iii]:
    #                     address = iv
    #                     catering_kind = cater_dic[i][ii][iii][iv]['catering_kind']
    #                     average_price = cater_dic[i][ii][iii][iv]['average_price']
    #                     if_in_business_area = cater_dic[i][ii][iii][iv]['if_in_business_area']
    #                     if_in_business_area_criterion = cater_dic[i][ii][iii][iv]['if_in_business_area_criterion']
    #                     data_from = cater_dic[i][ii][iii][iv]['data_from']
    #                     str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
    #                         city, district, name, address, if_in_business_area, if_in_business_area_criterion,
    #                         catering_kind, average_price, data_from)
    #                     f.write(str)
    #                     count_write_rows += 1
    #     f.closed
    #     print(count_write_rows)
    #
    #
    # if __name__ == '__main__':
    #     main()

    82000 277
    186000 345
    42000 251
    186000 346
    186000 347
    42000 252
    82000 278
    42000 253
    42000 254
    40000 346
    40000 347
    42000 255
    40000 348
    42000 256
    40000 349
    82000 279
    40000 350
    sleep
    72000 279
    12000 350
    sleep
    72000 280
    72000 281
    72000 282
    96000 274
    72000 283
    96000 275
    186000 348
    72000 284
    186000 349
    106000 275
    132000 328
    166000 298
    188000 372
    60000 336
    60000 337
    60000 338
    60000 339
    60000 340
    82000 280
    42000 257
    82000 281
    82000 282
    60000 341
    186000 350
    sleep
    96000 276
    72000 285
    72000 286
    40000 351
    72000 287
    96000 277
    96000 278
    72000 288
    72000 289
    96000 279
    72000 290
    96000 280
    72000 291
    96000 281
    72000 292
    2000 371
    96000 282
    102000 255

    # -*- coding: UTF-8 -*-
    
    import re
    import pprint
    import json
    import time
    import math
    import sys
    import requests
    import threading
    from time import ctime, sleep
    import random
    
    # Build ZHITONGZI_CITY_DIC: for each of the two cities, the list of names
    # parsed from '直筒子市_东莞中山.txt'.
    #
    # Every line is cut into ';'-separated segments and every segment into
    # '、'-separated names.  Only segments holding more than two names count:
    # the 1st and 2nd such segments feed '东莞市', the 3rd and 4th feed
    # '中山市', any later segment is ignored.
    #
    # The separator literals had been reduced to '' (str.split('') raises
    # ValueError and str.find('') is always 0); they are restored here and the
    # full-width forms of ';', '(' and ')' are accepted as well.
    ZHITONGZI_CITY_DIC = {}
    ZHITONGZI_CITY_DIC['东莞市'] = []
    ZHITONGZI_CITY_DIC['中山市'] = []
    c = 0  # number of qualifying segments seen so far, across all lines
    with open('直筒子市_东莞中山.txt', 'r', encoding='utf-8') as f:
        for i in f:
            for iii in re.split('[;;]', i):
                iv = iii.split('、')
                if len(iv) <= 2:
                    continue
                c += 1
                for v in iv:
                    # Treat full-width parentheses like their ASCII twins.
                    v = v.replace('(', '(').replace(')', ')')
                    if '(' in v:
                        # First name of a bracketed list: keep what follows '('.
                        v_ = v.split('(')[1]
                    elif ')' in v:
                        # Last name of a bracketed list: keep what precedes ')'.
                        v_ = v.split(')')[0]
                    else:
                        v_ = v
                    if c in (1, 2):
                        ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                    elif c in (3, 4):
                        ZHITONGZI_CITY_DIC['中山市'].append(v_)
    
    
    def chk_is_coffee(str):
        """Tell whether *str* mentions a coffee shop.

        Latin keywords are compared case-insensitively (both sides upper-cased);
        the Chinese keywords are looked up verbatim.  Returns True or False.
        """
        latin_keywords = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
        chinese_keywords = ['咖啡', '星巴克']

        shouted = str.upper()
        if any(word.upper() in shouted for word in latin_keywords):
            return True
        return any(word in str for word in chinese_keywords)
    
    
    def chk_kfc_mdl(str):
        """Classify a shop name by fast-food brand.

        Returns 1 when the name contains '麦当劳', 0 when it contains '肯德基'
        or 'KFC' (any letter case), and 2 otherwise.  '麦当劳' wins when both
        brands appear.
        """
        if u"麦当劳" in str:
            return 1
        mentions_kfc = u"肯德基" in str or u"KFC" in str.upper()
        return 0 if mentions_kfc else 2
    
    
    def get_name(str):
        """Reduce a shop title to its brand name.

        * Titles containing '麦当劳' become '麦当劳'; titles containing '肯德基'
          or 'KFC' (any letter case) become '肯德基'.
        * Titles without an opening parenthesis are returned unchanged.
        * Otherwise the text before the first parenthesis is returned, e.g.
          '狗不理包子(前门店)' -> '狗不理包子'.
        * When the title *starts* with a parenthesised prefix, the text between
          that prefix and the next parenthesis is returned, e.g.
          '(清真)三羊水饺(新民路店)' -> '三羊水饺'.  If the prefix is never
          closed the result is ''.

        ASCII and full-width parentheses are treated alike.  The parenthesis
        literals had been reduced to '' (str.find('') is always 0 and
        str.split('') raises ValueError), which made every title that is not
        KFC / McDonald's crash.
        """
        if str.find("麦当劳") != -1:
            return '麦当劳'
        if str.find("肯德基") != -1 or str.upper().find("KFC") != -1:
            return '肯德基'

        # Work on a copy in which full-width parentheses are mapped to ASCII so
        # one set of separators covers both spellings.
        text = str.replace('(', '(').replace(')', ')')
        if '(' not in text:
            # e.g. '| 添椒 | 潮涮三国IP火锅' -- nothing to trim.
            return str

        res = text.strip(' ').split('(')[0].strip(' ')
        if not res:
            # Leading '(...)' prefix: take what follows the first ')'.
            pieces = text.split(')')
            if len(pieces) > 1:
                res = pieces[1].split('(')[0].strip(' ')
        return res
    
    
    def chk_city_district(str):
        """Validate a city / district field.

        Returns False when the value (spaces removed) contains an ASCII letter
        or digit, or when the raw value contains '[', '(' or ')'.  Otherwise
        returns the value with all spaces removed (which is '' for a blank
        input).
        """
        compact = str.replace(' ', '')
        if re.match(r".*[a-zA-Z0-9]", compact):
            return False
        if any(mark in str for mark in ("[", "(", ")")):
            return False
        return compact
    
    
    def chk_catering_kind(str):
        """Validate a cuisine field.

        Returns False when the value (spaces removed) contains an ASCII digit,
        otherwise the value with all spaces removed.
        """
        compact = str.replace(' ', '')
        has_digit = re.match(r".*[0-9]", compact) is not None
        return False if has_digit else compact
    
    
    # ['a','',' ']
    def chk_list_thickness(list_):
        """Check that every string in *list_* has non-space content.

        Returns False for an empty list or as soon as an element is empty once
        its spaces are removed; otherwise returns a new list holding every
        element with its spaces removed.
        """
        if not len(list_):
            return False
        compacted = []
        for item in list_:
            squeezed = item.replace(' ', '')
            if not squeezed:
                return False
            compacted.append(squeezed)
        return compacted
    
    
    # Substrings whose presence in an address marks it as "in a business area".
    # The list must not contain '': the empty string is found in every string,
    # so a single '' entry makes chk_in_business_area() return 1 for every
    # address.  The four single-character tags that had been reduced to '' are
    # restored here.
    business_area_tag_list = ['大厦', '大楼', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']


    def chk_in_business_area(str):
        """Return 1 if *str* contains any tag of ``business_area_tag_list``, else 0.

        Matching is a plain, case-sensitive substring test.  Empty tags are
        skipped so that a damaged list cannot turn every address into a match.
        """
        for tag in business_area_tag_list:
            if tag and tag in str:
                return 1
        return 0
    
    
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    # Throttle: after every QPS counted requests the caller sleeps TIME_UNIT seconds.
    QPS = 50
    TIME_UNIT = 1
    # http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # Each usable line of the pool file is '<anything>\t<key> ...': the key is
    # the first whitespace-delimited token of the second tab-separated field.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            list_ = i.split('\t')
            key = list_[1].split() if len(list_) > 1 else []
            if key:
                KEY_POOL_LIST.append(key[0])
            else:
                # Report the offending line instead of the bare Exception class.
                print('skipping malformed key-pool line:', repr(i))
    # Index of the last key in the pool (-1 when the pool is empty).
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1

    # Sample row: Beijing / Xicheng district / shop name / street address
    # 北京市    西城区    金堂羊蝎子火锅    真武庙四条1号
    # http://restapi.amap.com/v3/place/around?parameters
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    # URL_TYPE = 'http://restapi.amap.com/v3/around'
    touse_key = ''
    # Query-string fragments appended to URL_TYPE when a search URL is built.
    RADIUS = '&radius=20'
    keywords = '&keywords='
    OFFSET = '&offset=10'
    CITYLIMIT = '&citylimit=true'

    # Probe URL used by change_key() to find out whether a key is accepted.
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

    # Number of probe requests issued by change_key(), used for throttling.
    change_key_qps = 0
    
    
    def change_key():
        """Switch the global ``touse_key`` to another key that the API accepts.

        Candidates are tried once each, starting at a random position in
        ``KEY_POOL_LIST`` and wrapping around.  The key currently in use is
        skipped unless it is the only key in the pool.  A candidate is accepted
        when the probe request (``URL_FOR_CHANGE_KEY``) answers with
        ``infocode == INFOCODE_OK``; ``touse_key`` is only updated then.

        Every probe increments ``change_key_qps``; after each ``QPS`` probes
        the function sleeps ``TIME_UNIT`` seconds.

        Raises:
            SystemExit: with message 'NOInvalidKEY' when the pool is empty or
                no candidate is accepted.

        Previously the scan used ``range(start, KEY_POOL_NUM_INDICATOR)``,
        which never reached the last key, left the "last key" branches
        (including the exit) unreachable, silently did nothing when the random
        start was the last index, and retried through unbounded recursion.
        """
        global touse_key, change_key_qps

        pool_size = len(KEY_POOL_LIST)
        if pool_size == 0:
            sys.exit('NOInvalidKEY')

        # AMap does not honour its own QPS / daily-quota policy, so its return
        # codes cannot be used to schedule keys; start from a random key.
        first = random.randrange(pool_size)
        for step in range(pool_size):
            key = KEY_POOL_LIST[(first + step) % pool_size]
            if key == touse_key and pool_size > 1:
                continue

            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(TIME_UNIT)
            url = URL_FOR_CHANGE_KEY % (key)
            try:
                json_ = requests.get(url).json()
            except (requests.RequestException, ValueError) as exc:
                # Network failure or a body that is not JSON: try the next key.
                print('requests.get(url)', exc)
                continue

            infocode = json_.get('infocode') if isinstance(json_, dict) else None
            if infocode == INFOCODE_OK:
                touse_key = key
                return

        sys.exit('NOInvalidKEY')
    
    
    
        # 060101    购物服务    商场    购物中心
    # POI type fragments that count as "business area" (AMap type 060101:
    # shopping service / mall / shopping centre).
    FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


    def fliter_gd_business_area_type(url):
        """Query *url* and report whether any returned POI is a business-area type.

        Returns:
            1 -- at least one POI's ``type`` contains an entry of
                 ``FILTER_GD_BUSINESS_AREA_TYPE_LIST``;
            0 -- no POI matched, or the response's ``infocode`` was not
                 '10000' (in that case ``change_key()`` is called first);
            3 -- the request failed or the body could not be parsed as JSON.

        Sample response:
        {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1",
         "count":"1","pois":[{"typecode":"050118","adname":"西城区", ...,
         "type":"餐饮服务;中餐厅;特色/地方风味餐厅","name":"炙热青春"}],
         "infocode":"10000"}
        """
        try:
            r_json = requests.get(url).json()
        except (requests.RequestException, ValueError) as exc:
            # 203 is the original log tag; print the actual error rather than
            # the Exception class.
            print(203, exc)
            return 3

        if r_json.get('infocode') != '10000':
            change_key()
            return 0

        if int(r_json.get('count', 0)) > 0:
            for poi in r_json.get('pois', []):
                poi_type = poi.get('type', '')
                if not isinstance(poi_type, str):
                    continue
                if any(wanted in poi_type for wanted in FILTER_GD_BUSINESS_AREA_TYPE_LIST):
                    return 1
        return 0
    
    
    # f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
    # f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
    # f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')

    # city key -> district key -> brand-name key -> address key -> details
    # {city:{district:{name:{address:{catering_kind,average_price,data_from}}}}}
    cater_dic = {}
    # [{city,district,address,name,catering_kind,average_price,data_from}]
    cater_exception_list = []
    coffee_list = []
    count_catering = 0
    count_catering_exception = 0
    count_coffee = 0

    # Load the whole input once; the worker threads slice this list.
    # NOTE(review): no encoding is passed, so the platform default applies --
    # confirm it matches the file.
    with open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r') as fo:
        file_line_list = list(fo)
    #
    # fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
    # while fo.readline():
    #     file_line_list_b.append(fo.readline())
    file_line_list_len = len(file_line_list)
    # Number of rows handled by one worker thread.
    file_jump_step_num = 2000
    
    
    def get_exception_logic_split_loop(nloop):
        """Classify one slice of ``file_line_list`` (worker run by ``MyThread``).

        The slice is rows ``nloop * file_jump_step_num`` up to, but excluding,
        ``(nloop + 1) * file_jump_step_num``, clamped to the end of the list.
        A row is ``city,district,address,name,average_price,catering_kind``.

        * Rows rejected by ``chk_list_thickness`` / ``chk_city_district`` /
          ``chk_catering_kind`` are appended to ``cater_exception_list``.
        * Rows whose name or cuisine satisfies ``chk_is_coffee`` are appended
          to ``coffee_list``.
        * Any other row is stored at ``cater_dic[city][district][name][address]``
          together with a business-area flag.  The flag comes from
          ``chk_in_business_area(address)``; when that is 0 the web API is
          asked through ``fliter_gd_business_area_type`` and its result is used.

        Prints 'last-line' and returns when the slice starts past the end of
        the data.  Returns None.

        Fixes over the previous version: the newline literal was broken across
        two lines; the slice end was clamped to ``len - 1`` although ``range``
        already excludes its end, so the final row was never read; rows with
        fewer than six fields raised IndexError and killed the thread.

        NOTE(review): the three counters are incremented from several threads
        without a lock.
        """
        global count_catering_exception, count_coffee, count_catering

        start_line = nloop * file_jump_step_num
        if start_line >= file_line_list_len:
            print('last-line')
            return

        end_line = min(start_line + file_jump_step_num, file_line_list_len)
        start_line_count = 0  # API lookups issued by this slice (throttling)
        data_from = 'mtdz_5'

        for i in range(start_line, end_line):
            raw_line = file_line_list[i].replace('\n', '')
            if not raw_line.strip():
                # Blank line (typically the trailing one): nothing to classify.
                continue
            l_ = raw_line.split(',')
            # Pad short rows with '' so they fail chk_list_thickness and are
            # recorded as exceptions instead of raising IndexError.
            l_.extend([''] * (6 - len(l_)))
            city, district, address, name, average_price, catering_kind = l_[:6]

            # Field list taken from the input row, validated as a whole below.
            focus_list = [city, district, address, name, catering_kind, average_price, data_from]
            dic_exception = {
                'data_from': data_from,
                'city': city,
                'district': district,
                'name': name,
                'address': address,
                'catering_kind': catering_kind,
                'average_price': average_price,
            }

            row_is_valid = (chk_list_thickness(focus_list)
                            and chk_city_district(city)
                            and chk_city_district(district)
                            and chk_catering_kind(catering_kind))
            if not row_is_valid:
                count_catering_exception += 1
                cater_exception_list.append(dic_exception)
                continue

            name = get_name(name)

            if chk_is_coffee(name) or chk_is_coffee(catering_kind):
                count_coffee += 1
                coffee_list.append(dic_exception)
                continue

            if_in_business_area = chk_in_business_area(address)
            if_in_business_area_criterion = 'str_match'
            if if_in_business_area == 0:
                # The address text was inconclusive: ask the web API.
                city_r = '&city=' + district
                keywords = '&keywords=' + address + '|' + name
                start_line_count += 1
                print(start_line, start_line_count)
                if start_line_count % QPS == 0:
                    print('sleep')
                    sleep(1)
                url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT

                if_in_business_area = fliter_gd_business_area_type(url)
                if_in_business_area_criterion = 'str_match+request_api'

            dic_details = {
                'data_from': data_from,
                'catering_kind': catering_kind,
                'average_price': average_price,
                'if_in_business_area_criterion': if_in_business_area_criterion,
                'if_in_business_area': if_in_business_area,
            }
            by_name = cater_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name, {})
            by_name[address] = dic_details
            count_catering += 1
    
    
    class MyThread(threading.Thread):
        """Thread that runs ``func(args)``; *args* is handed over as ONE argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.func, self.args = func, args
            # Assigned after initialisation so that '' is kept as the name.
            self.name = name

        def run(self):
            target, payload = self.func, self.args
            target(payload)
    
    
    def main():
        """Process the input in parallel chunks, then dump ``cater_dic`` to ``ALL.csv``.

        One ``MyThread`` is started per ``file_jump_step_num``-line chunk of
        ``file_line_list``. After all of them have been joined, the nested
        ``cater_dic`` (city -> district -> name -> address -> details) is
        flattened into one CSV row per address and the row count is printed.
        """
        print('starting at:', ctime())
        thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
        print(thread_sum)
        threads_list = []
        # Chunk indices are 0-based: starting at 1 skipped the first chunk and
        # processed nothing at all when the whole file fits in a single chunk.
        for nloop in range(thread_sum):
            print(nloop)
            threads_list.append(
                MyThread(get_exception_logic_split_loop, nloop, get_exception_logic_split_loop.__name__))
        # The main thread exits only after every non-daemon thread has finished.
        for t in threads_list:
            print(t)
            # Assigning to ``setDaemon`` would only shadow the method; set the flag itself.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        header = ('city, district, name, address, if_in_business_area, if_in_business_area_criterion,'
                  'catering_kind, average_price, data_from\n')
        count_write_rows = 0
        # Mode 'w' truncates any previous file; ``with`` guarantees the handle is closed.
        with open('ALL.csv', 'w', encoding='utf-8-sig') as f:
            f.write(header)
            for city, districts in cater_dic.items():
                if city == '城市':
                    # Presumably the header row of the input that was parsed as data.
                    continue
                for district, names in districts.items():
                    for name, addresses in names.items():
                        for address, details in addresses.items():
                            f.write('%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                                city, district, name, address,
                                details['if_in_business_area'],
                                details['if_in_business_area_criterion'],
                                details['catering_kind'], details['average_price'],
                                details['data_from']))
                            count_write_rows += 1
        print(count_write_rows)
    
    
    # Run the pipeline only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()
    # -*- coding: UTF-8 -*-
    
    import re
    import pprint
    import json
    import time
    import math
    import sys
    import requests
    import threading
    from time import ctime, sleep
    import random
    
    # Build ZHITONGZI_CITY_DIC: maps '东莞市' and '中山市' to lists of name
    # fragments parsed from the text file opened below.
    # NOTE(review): every separator/marker literal in this loop is '' in this copy.
    # str.split('') raises ValueError and str.find('') is always 0, so the original
    # delimiter characters were presumably lost - restore them before running.
    ZHITONGZI_CITY_DIC = {}
    f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
    ZHITONGZI_CITY_DIC['东莞市'] = []
    ZHITONGZI_CITY_DIC['中山市'] = []
    # c counts the pieces that split into more than two fields.
    c = 0
    for i in f:
        ii = i.split('')
        for iii in ii:
            iv = iii.split('')
            if len(iv) > 2:
                c += 1
                for v in iv:
                    # Keep only the part of the field on one side of a marker, if present.
                    if v.find('') > -1:
                        v_ = v.split('')[1]
                    elif v.find('') > -1:
                        v_ = v.split('')[0]
                    else:
                        v_ = v
                    # Qualifying pieces 1-2 are filed under 东莞市, pieces 3-4 under 中山市.
                    if c == 1 or c == 2:
                        ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                    elif c == 3 or c == 4:
                        ZHITONGZI_CITY_DIC['中山市'].append(v_)
    # NOTE(review): this only reads the ``closed`` attribute; the file is never closed.
    f.closed
    
    
    def chk_is_coffee(str):
        """Return True when ``str`` contains a coffee-related keyword.

        Latin keywords are matched case-insensitively (both sides upper-cased);
        the Chinese keywords are matched as-is.
        """
        latin_markers = ('coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé')
        cjk_markers = ('咖啡', '星巴克')
        shouted = str.upper()
        if any(marker.upper() in shouted for marker in latin_markers):
            return True
        return any(marker in str for marker in cjk_markers)
    
    
    def chk_kfc_mdl(str):
        """Classify a shop name: 1 if it contains 麦当劳, 0 if it contains 肯德基 or KFC (any case), else 2."""
        if u"麦当劳" in str:
            return 1
        mentions_kfc = u"肯德基" in str or u"KFC" in str.upper()
        return 0 if mentions_kfc else 2
    
    
    def get_name(str):
        """Reduce a shop name to its brand part.

        Names containing 麦当劳, or 肯德基 / 'KFC' in any letter case, are mapped
        to the fixed brand strings. Any other name is cut at a separator and at
        the first '(' so that branch suffixes are dropped.

        NOTE(review): the literals passed to ``find('')`` / ``split('')`` are empty
        in this copy. ``str.find('')`` is always 0 and ``str.split('')`` raises
        ValueError, so a delimiter character was presumably lost - restore it
        before relying on this function.
        """
        if str.find("麦当劳") != -1:
            return '麦当劳'
        elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
            return '肯德基'
        else:
            # Example inputs:
            #   '狗不理包子(前门店)'
            #   '(清真)三羊水饺(新民路店)'
            #   '| 添椒 | 潮涮三国IP火锅'
            if str.find('') == -1 and str.find('(') == -1:
                return str
            res = str.strip(' ').split('')[0].strip(' ')
            if len(res) == 0:
                # Nothing before the first separator: use the second piece instead.
                try:
                    res = str.split('')[1].split('(')[0]
                except Exception:
                    # Prints the Exception class itself, not the error that was caught.
                    print(Exception)
            # Example: '一锅两头牛(烟青路店)' keeps only the text before '('.
            res_b = res
            try:
                res_b = res.split('(')[0]
            except Exception:
                print(Exception)
    
            return res_b
    
    
    def chk_city_district(str):
        """Validate a city/district field.

        Returns the value with spaces removed, or False when it contains an ASCII
        letter or digit, or when the raw input contains '[', '(' or ')'.
        """
        compact = str.replace(' ', '')
        has_ascii_alnum = re.match(r".*[a-zA-Z0-9]", compact) is not None
        has_bracket = any(ch in str for ch in ("[", "(", ")"))
        if has_ascii_alnum or has_bracket:
            return False
        return compact
    
    
    def chk_catering_kind(str):
        """Return the catering kind with spaces removed, or False if it contains a digit."""
        compact = str.replace(' ', '')
        contains_digit = re.match(r".*[0-9]", compact) is not None
        return False if contains_digit else compact
    
    
    # ['a','',' ']
    def chk_list_thickness(list_):
        """Return the items with spaces removed, or False if the list is empty or any item is blank.

        Example: ['a', '', ' '] -> False.
        """
        if len(list_) < 1:
            return False
        compacted = []
        for item in list_:
            squeezed = item.replace(' ', '')
            if not squeezed:
                # One blank field invalidates the whole list.
                return False
            compacted.append(squeezed)
        return compacted
    
    
    # Substrings whose presence in an address marks it as a business-area location.
    # NOTE(review): the four '' entries look like tags whose characters were lost
    # in transcription - restore them if the original values are known.
    business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '', '', '', '', '底商']


    def chk_in_business_area(str):
        """Return 1 if ``str`` contains any non-empty business-area tag, else 0.

        Empty tags are skipped: ``str.find('')`` is always 0, so a blank entry in
        ``business_area_tag_list`` would classify every address as a match.
        """
        for tag in business_area_tag_list:
            if tag and tag in str:
                return 1
        return 0
    
    
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    # Paging and rate-limit settings for the AMap (Gaode) web-service calls.
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    QPS = 50  # after every QPS-th request the callers pause
    TIME_UNIT = 1  # length of that pause in seconds (passed to sleep())
    # Response-code reference: http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # The key is the first whitespace-delimited token of the second tab-separated
    # column; lines without one are reported and skipped.
    # ``with`` closes the pool file, which the bare open() never did.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            try:
                key = i.split('\t')[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as exc:
                # Report the actual problem rather than the Exception class object.
                print(exc)
    # Index of the last key in the pool (-1 when the pool is empty).
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1

    # Sample input row: 北京市    西城区    金堂羊蝎子火锅    真武庙四条1号
    # Keyword-search endpoint (the nearby-search variant is .../v3/place/around).
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    RADIUS = '&radius=20'
    keywords = '&keywords='
    OFFSET = '&offset=10'
    CITYLIMIT = '&citylimit=true'

    # Probe request used by change_key() to check whether a key is still accepted.
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

    # Number of probe requests issued so far; drives the pause in change_key().
    change_key_qps = 0
    
    
    def change_key():
        """Switch the global ``touse_key`` to another pool key that passes a probe request.

        Starting at a random position, every key in ``KEY_POOL_LIST`` other than
        the current one is probed once with ``URL_FOR_CHANGE_KEY``; the first key
        answering with ``INFOCODE_OK`` becomes ``touse_key``.

        The previous recursive version could never reach the last key (the range
        stopped one short, so its exhaustion check was dead code), did nothing
        when the random start was the last index, and recursed without bound
        while keys kept failing.

        Raises:
            SystemExit: with 'NOInvalidKEY' when the pool is empty or no other
                key passes the probe.
        """
        global touse_key, change_key_qps

        # AMap does not honour its own QPS / daily-quota policy, so its return
        # codes cannot be relied on to steer key usage; start at a random key.
        pool_size = len(KEY_POOL_LIST)
        if pool_size == 0:
            sys.exit('NOInvalidKEY')
        mean_use_key = random.randint(0, pool_size - 1)
        for offset in range(pool_size):
            key = KEY_POOL_LIST[(mean_use_key + offset) % pool_size]
            if key == touse_key:
                continue
            # Pause after every QPS-th probe request.
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(TIME_UNIT)
            try:
                infocode = requests.get(URL_FOR_CHANGE_KEY % key).json()['infocode']
            except (requests.RequestException, ValueError, KeyError) as exc:
                # Network failure, non-JSON body or missing field: try the next key.
                print('requests.get(url)', exc)
                continue
            if infocode == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    
    
    
        # 060101    购物服务    商场    购物中心
    # POI type fragments that count as a business area.
    FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


    def fliter_gd_business_area_type(url):
        """Query ``url`` and report whether any returned POI is a business-area type.

        The response is read as JSON with the fields ``infocode``, ``count`` and
        ``pois`` (each POI carrying a ``type`` string such as
        "餐饮服务;中餐厅;特色/地方风味餐厅").

        Returns:
            1 if some POI ``type`` contains an entry of
            ``FILTER_GD_BUSINESS_AREA_TYPE_LIST``; 0 if none does, or if
            ``infocode`` is not '10000' (in which case ``change_key()`` is called
            first); 3 if the request or the JSON decoding fails.
        """
        try:
            payload = requests.get(url).json()
        except Exception:
            print(203, Exception)
            # The response could not be fetched or parsed as JSON.
            return 3
        if payload['infocode'] != '10000':
            change_key()
            return 0
        if int(payload['count']) <= 0:
            return 0
        for poi in payload['pois']:
            poi_type = poi['type']
            if any(poi_type.find(wanted) > -1 for wanted in FILTER_GD_BUSINESS_AREA_TYPE_LIST):
                return 1
        return 0
    
    
    # f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
    # f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
    # f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
    
    # Nested result store keyed city -> district -> brand name -> address; each
    # leaf is a dict of details (catering_kind, average_price, data_from, ...).
    cater_dic = {}
    # Rejected rows, one dict each with the keys
    # city, district, address, name, catering_kind, average_price, data_from.
    cater_exception_list = []
    coffee_list = []
    count_catering = 0
    count_catering_exception = 0
    count_coffee = 0

    # Load every input line up front so the worker threads can slice by index.
    # ``with`` closes the handle; the old ``fo.closed`` only read an attribute.
    # Iterating the file object yields each line once, whereas a
    # ``while fo.readline(): append(fo.readline())`` loop consumes two lines per
    # pass and keeps only every second one.
    with open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r') as fo:
        file_line_list = list(fo)
    file_line_list_len = len(file_line_list)
    # Number of input lines handled by each worker thread.
    file_jump_step_num = 4000
    
    
    def get_exception_logic_split_loop(nloop):
        """Classify one chunk of ``file_line_list`` and store accepted rows in ``cater_dic``.

        Chunk ``nloop`` covers the lines from ``nloop * file_jump_step_num`` up to
        (but excluding) ``(nloop + 1) * file_jump_step_num``, clipped to the end
        of the list. Each line is split on ',' into city, district, address,
        name, average_price, catering_kind.

        * Rows failing validation are appended to ``cater_exception_list``.
        * Rows recognised as coffee shops are appended to ``coffee_list``.
        * All other rows are stored under ``cater_dic[city][district][name][address]``
          together with a business-area flag. The flag comes from string matching
          on the address and, if that finds nothing, from a place-search request.

        The module-level counters and containers are updated without locking.
        """
        print(247, nloop)
        global count_catering_exception, count_coffee, count_catering
        start_line = nloop * file_jump_step_num
        if start_line >= file_line_list_len:
            print('last-line')
            return
        # range() excludes its end, so clip to the list length itself; clipping to
        # ``len - 1`` silently dropped the final line of the file.
        end_line = min(start_line + file_jump_step_num, file_line_list_len)
        start_line_count = 0
        data_from = 'mtdz_5'
        for i in range(start_line, end_line):
            l_ = file_line_list[i].replace('\n', '').split(',')
            # Pad short rows with blanks so they are rejected by the validation
            # below instead of raising IndexError and killing the worker thread.
            city, district, address, name, average_price, catering_kind = (l_ + [''] * 6)[:6]
            focus_list = [city, district, address, name, catering_kind, average_price, data_from]
            dic_exception = {
                'data_from': data_from,
                'city': city,
                'district': district,
                'name': name,
                'address': address,
                'catering_kind': catering_kind,
                'average_price': average_price,
            }

            row_is_valid = (chk_list_thickness(focus_list) and chk_city_district(city)
                            and chk_city_district(district) and chk_catering_kind(catering_kind))
            if not row_is_valid:
                count_catering_exception += 1
                cater_exception_list.append(dic_exception)
                continue

            name = get_name(name)
            if chk_is_coffee(name) or chk_is_coffee(catering_kind):
                count_coffee += 1
                coffee_list.append(dic_exception)
                continue

            if_in_business_area = chk_in_business_area(address)
            if_in_business_area_criterion = 'str_match'
            if if_in_business_area == 0:
                # String matching found nothing: ask the place-search API.
                city_r = '&city=' + district
                keywords = '&keywords=' + address + '|' + name
                start_line_count += 1
                print(417, start_line, start_line_count)
                # Pause one second after every QPS-th request from this thread.
                if start_line_count % QPS == 0:
                    print('sleep')
                    sleep(1)
                url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
                if_in_business_area = fliter_gd_business_area_type(url)
                if_in_business_area_criterion = 'str_match+request_api'

            dic_details = {
                'data_from': data_from,
                'catering_kind': catering_kind,
                'average_price': average_price,
                'if_in_business_area_criterion': if_in_business_area_criterion,
                'if_in_business_area': if_in_business_area,
            }
            # Create the intermediate levels on demand; a repeated address overwrites.
            cater_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name, {})[address] = dic_details
            count_catering += 1
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func(args)`` when run; ``args`` is passed as one positional argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            # Assigned after base initialisation, so the given name (even '')
            # replaces the automatically generated one.
            self.name, self.func, self.args = name, func, args

        def run(self):
            callback = self.func
            callback(self.args)
    
    
    def main():
        """Process the input in parallel chunks, then dump ``cater_dic`` to ``ALL.csv``.

        One ``MyThread`` is started per ``file_jump_step_num``-line chunk of
        ``file_line_list``. After all of them have been joined, the nested
        ``cater_dic`` (city -> district -> name -> address -> details) is
        flattened into one CSV row per address and the row count is printed.
        """
        print('starting at:', ctime())
        thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
        print(thread_sum)
        threads_list = []
        # Chunk indices are 0-based: starting at 1 skipped the first chunk and
        # processed nothing at all when the whole file fits in a single chunk.
        for nloop in range(thread_sum):
            print(nloop)
            thread_instance = MyThread(
                get_exception_logic_split_loop, nloop, get_exception_logic_split_loop.__name__)
            print(353, '123')
            threads_list.append(thread_instance)
        # The main thread exits only after every non-daemon thread has finished.
        for t in threads_list:
            print(t)
            # Assigning to ``setDaemon`` would only shadow the method; set the flag itself.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        print(467, cater_dic)
        header = ('city, district, name, address, if_in_business_area, if_in_business_area_criterion,'
                  'catering_kind, average_price, data_from\n')
        count_write_rows = 0
        # Mode 'w' truncates any previous file; ``with`` guarantees the handle is closed.
        with open('ALL.csv', 'w', encoding='utf-8-sig') as f:
            f.write(header)
            for city, districts in cater_dic.items():
                if city == '城市':
                    # Presumably the header row of the input that was parsed as data.
                    continue
                for district, names in districts.items():
                    for name, addresses in names.items():
                        for address, details in addresses.items():
                            f.write('%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                                city, district, name, address,
                                details['if_in_business_area'],
                                details['if_in_business_area_criterion'],
                                details['catering_kind'], details['average_price'],
                                details['data_from']))
                            count_write_rows += 1
        print(count_write_rows)
    
    
    # Run the pipeline only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()
    # Number of change_key() calls so far; drives the pause inside it.
    change_key_qps = 0


    def change_key():
        """Switch the global ``touse_key`` to another pool key that passes a probe request.

        Starting at a random position, every key in ``KEY_POOL_LIST`` other than
        the current one is probed once with ``URL_FOR_CHANGE_KEY``; the first key
        answering with ``INFOCODE_OK`` becomes ``touse_key``.

        The previous recursive version fell through after a recursive retry (so
        ``json_`` could be unbound), could never reach the last key because the
        range stopped one short, and recursed without bound while keys kept
        failing.

        Raises:
            SystemExit: with 'NOInvalidKEY' when the pool is empty or no other
                key passes the probe.
        """
        global touse_key, change_key_qps
        # Pause after every QPS-th call.
        change_key_qps += 1
        if change_key_qps % QPS == 0:
            sleep(TIME_UNIT)
        # AMap does not honour its own QPS / daily-quota policy, so its return
        # codes cannot be relied on to steer key usage; start at a random key.
        pool_size = len(KEY_POOL_LIST)
        if pool_size == 0:
            sys.exit('NOInvalidKEY')
        mean_use_key = random.randint(0, pool_size - 1)
        print(mean_use_key)
        for offset in range(pool_size):
            key = KEY_POOL_LIST[(mean_use_key + offset) % pool_size]
            if key == touse_key:
                continue
            print(172, 'present_key', touse_key)
            print(175, 'touse_key', key)
            try:
                infocode = requests.get(URL_FOR_CHANGE_KEY % key).json()['infocode']
            except (requests.RequestException, ValueError, KeyError) as exc:
                # Network failure, non-JSON body or missing field: try the next key.
                print('requests.get(url)', exc)
                continue
            if infocode == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    

      

    高德没有遵守自己的QPS/日限策略;所以不能通过其返回码来控制key的使用。


    2
    172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
    175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
    172 present_key adf8e13d1b170fcef7132ea3178a2d6c
    175 touse_key 2f3d41dfbce352fc4d82009c552505fe
    172 present_key 2f3d41dfbce352fc4d82009c552505fe
    175 touse_key c5ef87ab7efe0d76b970fd330bf9e7f2
    172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
    175 touse_key 2f3d41dfbce352fc4d82009c552505fe
    172 present_key 2f3d41dfbce352fc4d82009c552505fe
    175 touse_key c5ef87ab7efe0d76b970fd330bf9e7f2
    172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
    175 touse_key 6d95ab3f63c494911002c1734089548a
    6
    172 present_key 6d95ab3f63c494911002c1734089548a
    175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
    6
    172 present_key adf8e13d1b170fcef7132ea3178a2d6c
    175 touse_key 6d95ab3f63c494911002c1734089548a
    6
    172 present_key 6d95ab3f63c494911002c1734089548a
    175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
    4
    172 present_key adf8e13d1b170fcef7132ea3178a2d6c
    175 touse_key c0d76e9fa950d0ff1761d56bd78a902e

    def change_key():
        """Switch the global ``touse_key`` to another pool key that passes a probe request.

        Starting at a random position, every key in ``KEY_POOL_LIST`` other than
        the current one is probed once with ``URL_FOR_CHANGE_KEY``; the first key
        answering with ``INFOCODE_OK`` becomes ``touse_key``.

        The previous recursive version fell through after a recursive retry (so
        ``json_`` could be unbound), could never reach the last key because the
        range stopped one short, and recursed without bound while keys kept
        failing.

        Raises:
            SystemExit: with 'NOInvalidKEY' when the pool is empty or no other
                key passes the probe.
        """
        global touse_key
        pool_size = len(KEY_POOL_LIST)
        if pool_size == 0:
            sys.exit('NOInvalidKEY')
        mean_use_key = random.randint(0, pool_size - 1)
        print(mean_use_key)
        for offset in range(pool_size):
            key = KEY_POOL_LIST[(mean_use_key + offset) % pool_size]
            if key == touse_key:
                continue
            print(172, 'present_key', touse_key)
            print(175, 'touse_key', key)
            try:
                infocode = requests.get(URL_FOR_CHANGE_KEY % key).json()['infocode']
            except (requests.RequestException, ValueError, KeyError) as exc:
                # Network failure, non-JSON body or missing field: try the next key.
                print('requests.get(url)', exc)
                continue
            if infocode == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    

      

    # -*- coding: UTF-8 -*-
    
    import re
    import pprint
    import json
    import time
    import math
    import sys
    import requests
    import threading
    from time import ctime, sleep
    
    # Build ZHITONGZI_CITY_DIC: maps '东莞市' and '中山市' to lists of name
    # fragments parsed from the text file opened below.
    # NOTE(review): every separator/marker literal in this loop is '' in this copy.
    # str.split('') raises ValueError and str.find('') is always 0, so the original
    # delimiter characters were presumably lost - restore them before running.
    ZHITONGZI_CITY_DIC = {}
    f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
    ZHITONGZI_CITY_DIC['东莞市'] = []
    ZHITONGZI_CITY_DIC['中山市'] = []
    # c counts the pieces that split into more than two fields.
    c = 0
    for i in f:
        ii = i.split('')
        for iii in ii:
            iv = iii.split('')
            if len(iv) > 2:
                c += 1
                for v in iv:
                    # Keep only the part of the field on one side of a marker, if present.
                    if v.find('') > -1:
                        v_ = v.split('')[1]
                    elif v.find('') > -1:
                        v_ = v.split('')[0]
                    else:
                        v_ = v
                    # Qualifying pieces 1-2 are filed under 东莞市, pieces 3-4 under 中山市.
                    if c == 1 or c == 2:
                        ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                    elif c == 3 or c == 4:
                        ZHITONGZI_CITY_DIC['中山市'].append(v_)
    # NOTE(review): this only reads the ``closed`` attribute; the file is never closed.
    f.closed
    
    
    def chk_is_coffee(str):
        """Return True when ``str`` contains a coffee-related keyword.

        Latin keywords are matched case-insensitively (both sides upper-cased);
        the Chinese keywords are matched as-is.
        """
        latin_markers = ('coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé')
        cjk_markers = ('咖啡', '星巴克')
        shouted = str.upper()
        if any(marker.upper() in shouted for marker in latin_markers):
            return True
        return any(marker in str for marker in cjk_markers)
    
    
    def chk_kfc_mdl(str):
        """Classify a shop name: 1 if it contains 麦当劳, 0 if it contains 肯德基 or KFC (any case), else 2."""
        if u"麦当劳" in str:
            return 1
        mentions_kfc = u"肯德基" in str or u"KFC" in str.upper()
        return 0 if mentions_kfc else 2
    
    
    def get_name(str):
        """Reduce a shop name to its brand part.

        Names containing 麦当劳, or 肯德基 / 'KFC' in any letter case, are mapped
        to the fixed brand strings. Any other name is cut at a separator and at
        the first '(' so that branch suffixes are dropped.

        NOTE(review): the literals passed to ``find('')`` / ``split('')`` are empty
        in this copy. ``str.find('')`` is always 0 and ``str.split('')`` raises
        ValueError, so a delimiter character was presumably lost - restore it
        before relying on this function.
        """
        if str.find("麦当劳") != -1:
            return '麦当劳'
        elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
            return '肯德基'
        else:
            # Example inputs:
            #   '狗不理包子(前门店)'
            #   '(清真)三羊水饺(新民路店)'
            #   '| 添椒 | 潮涮三国IP火锅'
            if str.find('') == -1 and str.find('(') == -1:
                return str
            res = str.strip(' ').split('')[0].strip(' ')
            if len(res) == 0:
                # Nothing before the first separator: use the second piece instead.
                try:
                    res = str.split('')[1].split('(')[0]
                except Exception:
                    # Prints the Exception class itself, not the error that was caught.
                    print(Exception)
            # Example: '一锅两头牛(烟青路店)' keeps only the text before '('.
            res_b = res
            try:
                res_b = res.split('(')[0]
            except Exception:
                print(Exception)
    
            return res_b
    
    
    def chk_city_district(str):
        """Validate a city/district field.

        Returns the value with spaces removed, or False when it contains an ASCII
        letter or digit, or when the raw input contains '[', '(' or ')'.
        """
        compact = str.replace(' ', '')
        has_ascii_alnum = re.match(r".*[a-zA-Z0-9]", compact) is not None
        has_bracket = any(ch in str for ch in ("[", "(", ")"))
        if has_ascii_alnum or has_bracket:
            return False
        return compact
    
    
    def chk_catering_kind(str):
        """Return the catering kind with spaces removed, or False if it contains a digit."""
        compact = str.replace(' ', '')
        contains_digit = re.match(r".*[0-9]", compact) is not None
        return False if contains_digit else compact
    
    
    # ['a','',' ']
    def chk_list_thickness(list_):
        """Return the items with spaces removed, or False if the list is empty or any item is blank.

        Example: ['a', '', ' '] -> False.
        """
        if len(list_) < 1:
            return False
        compacted = []
        for item in list_:
            squeezed = item.replace(' ', '')
            if not squeezed:
                # One blank field invalidates the whole list.
                return False
            compacted.append(squeezed)
        return compacted
    
    
    # Substrings whose presence in an address marks it as a business-area location.
    # NOTE(review): the four '' entries look like tags whose characters were lost
    # in transcription - restore them if the original values are known.
    business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '', '', '', '', '底商']


    def chk_in_business_area(str):
        """Return 1 if ``str`` contains any non-empty business-area tag, else 0.

        Empty tags are skipped: ``str.find('')`` is always 0, so a blank entry in
        ``business_area_tag_list`` would classify every address as a match.
        """
        for tag in business_area_tag_list:
            if tag and tag in str:
                return 1
        return 0
    
    
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    # Paging and rate-limit settings for the AMap (Gaode) web-service calls.
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    QPS = 50  # after every QPS-th request the callers pause
    TIME_UNIT = 1
    # Response-code reference: http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # The key is the first whitespace-delimited token of the second tab-separated
    # column; lines without one are reported and skipped.
    # ``with`` closes the pool file, which the bare open() never did.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            try:
                key = i.split('\t')[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as exc:
                # Report the actual problem rather than the Exception class object.
                print(exc)
    # Index of the last key in the pool (-1 when the pool is empty).
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1

    # Sample input row: 北京市    西城区    金堂羊蝎子火锅    真武庙四条1号
    # Keyword-search endpoint (the nearby-search variant is .../v3/place/around).
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    RADIUS = '&radius=20'
    keywords = '&keywords='
    OFFSET = '&offset=10'
    CITYLIMIT = '&citylimit=true'

    # Probe request used by change_key() to check whether a key is still accepted.
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
    
    
    def change_key():
        """Switch the global ``touse_key`` to another pool key that passes a probe request.

        Every key in ``KEY_POOL_LIST`` other than the current one is probed once,
        in pool order, with ``URL_FOR_CHANGE_KEY``; the first key answering with
        ``INFOCODE_OK`` becomes ``touse_key``.

        The previous version kept looping after a key had been accepted (so it
        went on overwriting ``touse_key``), used ``r`` even when the request had
        failed, could never reach the last key because the range stopped one
        short, and recursed without bound while keys kept failing.

        Raises:
            SystemExit: with 'NOInvalidKEY' when no other key passes the probe.
        """
        global touse_key
        for key in KEY_POOL_LIST:
            if key == touse_key:
                continue
            url = URL_FOR_CHANGE_KEY % (key)
            print(62, 'chk_key', url)
            print(62, 'touse_key', key)
            try:
                infocode = requests.get(url).json()['infocode']
            except (requests.RequestException, ValueError, KeyError) as exc:
                # Network failure, non-JSON body or missing field: try the next key.
                print(exc)
                continue
            if infocode == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    
    
    # 060101: shopping services / mall / shopping centre.
    # POI type fragments that count as a business area.
    FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


    def fliter_gd_business_area_type(url):
        """Query ``url`` and report whether any returned POI is a business-area type.

        The response is read as JSON with the fields ``infocode``, ``count`` and
        ``pois`` (each POI carrying a ``type`` string such as
        "餐饮服务;中餐厅;特色/地方风味餐厅").

        Returns:
            1 if some POI ``type`` contains an entry of
            ``FILTER_GD_BUSINESS_AREA_TYPE_LIST``; otherwise 0. 0 is also returned
            when the request or JSON decoding fails, and when ``infocode`` is not
            '10000' (after calling ``change_key()``).
        """
        try:
            payload = requests.get(url).json()
        except Exception:
            print(Exception)
            print(195, url)
            return 0
        if payload['infocode'] != '10000':
            change_key()
            return 0
        if int(payload['count']) <= 0:
            return 0
        for poi in payload['pois']:
            poi_type = poi['type']
            if any(poi_type.find(wanted) > -1 for wanted in FILTER_GD_BUSINESS_AREA_TYPE_LIST):
                return 1
        return 0
    
    
    
    # f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
    # f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
    # f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
    
    # Nested result store keyed city -> district -> brand name -> address; each
    # leaf is a dict of details (catering_kind, average_price, data_from, ...).
    cater_dic = {}
    # Rejected rows, one dict each with the keys
    # city, district, address, name, catering_kind, average_price, data_from.
    cater_exception_list = []
    coffee_list = []
    count_catering = 0
    count_catering_exception = 0
    count_coffee = 0

    # Load every input line up front so the worker threads can slice by index.
    # ``with`` closes the handle; the old ``fo.closed`` only read an attribute.
    # Iterating the file object yields each line once, whereas a
    # ``while fo.readline(): append(fo.readline())`` loop consumes two lines per
    # pass and keeps only every second one.
    with open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r') as fo:
        file_line_list = list(fo)
    file_line_list_len = len(file_line_list)
    # Number of input lines handled by each worker thread.
    file_jump_step_num = 10000
    
    
    def get_exception_logic_split_loop(nloop):
        """Classify one chunk of ``file_line_list`` and store accepted rows in ``cater_dic``.

        Chunk ``nloop`` covers the lines from ``nloop * file_jump_step_num`` up to
        (but excluding) ``(nloop + 1) * file_jump_step_num``, clipped to the end
        of the list. Each line is split on ',' into city, district, address,
        name, average_price, catering_kind.

        * Rows failing validation are appended to ``cater_exception_list``.
        * Rows recognised as coffee shops are appended to ``coffee_list``.
        * All other rows are stored under ``cater_dic[city][district][name][address]``
          together with a business-area flag. The flag comes from string matching
          on the address and, if that finds nothing, from a place-search request.

        The module-level counters and containers are updated without locking.
        """
        print(247, nloop)
        global count_catering_exception, count_coffee, count_catering
        start_line = nloop * file_jump_step_num
        if start_line >= file_line_list_len:
            print('last-line')
            return
        # range() excludes its end, so clip to the list length itself; clipping to
        # ``len - 1`` silently dropped the final line of the file.
        end_line = min(start_line + file_jump_step_num, file_line_list_len)
        start_line_count = 0
        data_from = 'mtdz_5'
        for i in range(start_line, end_line):
            l_ = file_line_list[i].replace('\n', '').split(',')
            # Pad short rows with blanks so they are rejected by the validation
            # below instead of raising IndexError and killing the worker thread.
            city, district, address, name, average_price, catering_kind = (l_ + [''] * 6)[:6]
            focus_list = [city, district, address, name, catering_kind, average_price, data_from]
            dic_exception = {
                'data_from': data_from,
                'city': city,
                'district': district,
                'name': name,
                'address': address,
                'catering_kind': catering_kind,
                'average_price': average_price,
            }

            row_is_valid = (chk_list_thickness(focus_list) and chk_city_district(city)
                            and chk_city_district(district) and chk_catering_kind(catering_kind))
            if not row_is_valid:
                count_catering_exception += 1
                cater_exception_list.append(dic_exception)
                continue

            name = get_name(name)
            if chk_is_coffee(name) or chk_is_coffee(catering_kind):
                count_coffee += 1
                coffee_list.append(dic_exception)
                continue

            if_in_business_area = chk_in_business_area(address)
            if_in_business_area_criterion = 'str_match'
            if if_in_business_area == 0:
                # String matching found nothing: ask the place-search API.
                city_r = '&city=' + district
                keywords = '&keywords=' + address + '|' + name
                start_line_count += 1
                print(417, start_line, start_line_count)
                # Pause one second after every QPS-th request from this thread.
                if start_line_count % QPS == 0:
                    print('sleep')
                    sleep(1)
                url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
                if_in_business_area = fliter_gd_business_area_type(url)
                if_in_business_area_criterion = 'str_match+request_api'

            dic_details = {
                'data_from': data_from,
                'catering_kind': catering_kind,
                'average_price': average_price,
                'if_in_business_area_criterion': if_in_business_area_criterion,
                'if_in_business_area': if_in_business_area,
            }
            # Create the intermediate levels on demand; a repeated address overwrites.
            cater_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name, {})[address] = dic_details
            count_catering += 1
    
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func(args)`` when run; ``args`` is passed as one positional argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            # Assigned after base initialisation, so the given name (even '')
            # replaces the automatically generated one.
            self.name, self.func, self.args = name, func, args

        def run(self):
            callback = self.func
            callback(self.args)
    
    def main():
        """Process the input in parallel chunks, then dump ``cater_dic`` to ``ALL.csv``.

        One ``MyThread`` is started per ``file_jump_step_num``-line chunk of
        ``file_line_list``. After all of them have been joined, the nested
        ``cater_dic`` (city -> district -> name -> address -> details) is
        flattened into one CSV row per address and the row count is printed.
        """
        print('starting at:', ctime())
        thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
        print(thread_sum)
        threads_list = []
        # Chunk indices are 0-based: starting at 1 skipped the first chunk and
        # processed nothing at all when the whole file fits in a single chunk.
        for nloop in range(thread_sum):
            print(nloop)
            thread_instance = MyThread(
                get_exception_logic_split_loop, nloop, get_exception_logic_split_loop.__name__)
            print(353, '123')
            threads_list.append(thread_instance)
        # The main thread exits only after every non-daemon thread has finished.
        for t in threads_list:
            print(t)
            # Assigning to ``setDaemon`` would only shadow the method; set the flag itself.
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        print(467, cater_dic)
        header = ('city, district, name, address, if_in_business_area, if_in_business_area_criterion,'
                  'catering_kind, average_price, data_from\n')
        count_write_rows = 0
        # Mode 'w' truncates any previous file; ``with`` guarantees the handle is closed.
        with open('ALL.csv', 'w', encoding='utf-8-sig') as f:
            f.write(header)
            for city, districts in cater_dic.items():
                if city == '城市':
                    # Presumably the header row of the input that was parsed as data.
                    continue
                for district, names in districts.items():
                    for name, addresses in names.items():
                        for address, details in addresses.items():
                            f.write('%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                                city, district, name, address,
                                details['if_in_business_area'],
                                details['if_in_business_area_criterion'],
                                details['catering_kind'], details['average_price'],
                                details['data_from']))
                            count_write_rows += 1
        print(count_write_rows)
    
    if __name__ == '__main__':
        main()

    问题:`while fo.readline(): file_line_list.append(fo.readline())` 每次循环调用了两次 readline(),条件里读到的那一行被丢弃,所以只保存了大约一半的行(少统计了)。

    # -*- coding: UTF-8 -*-
    
    import re
    import pprint
    import json
    import time
    import math
    import sys
    import requests
    import threading
    from time import ctime, sleep
    
    # Names parsed from the text file, grouped under the two city keys.
    ZHITONGZI_CITY_DIC = {'东莞市': [], '中山市': []}
    # Counts the groups (';'-separated segments holding more than two
    # '、'-separated names) seen so far; groups 1-2 go to the first city,
    # groups 3-4 to the second, later groups are ignored.
    c = 0
    # ``with`` closes the file; the old trailing ``f.closed`` only read an
    # attribute and left the handle open.
    with open('直筒子市_东莞中山.txt', 'r', encoding='utf-8') as f:
        for line in f:
            # Drop the line terminator so it cannot end up inside a name.
            for group in line.rstrip('\r\n').split(';'):
                names = group.split('、')
                if len(names) <= 2:
                    continue
                c += 1
                if c in (1, 2):
                    target = ZHITONGZI_CITY_DIC['东莞市']
                elif c in (3, 4):
                    target = ZHITONGZI_CITY_DIC['中山市']
                else:
                    continue
                for v in names:
                    if '(' in v:
                        # 'prefix(name' -> keep what follows the bracket.
                        v_ = v.split('(')[1]
                    elif ')' in v:
                        # 'name)suffix' -> keep what precedes the bracket.
                        v_ = v.split(')')[0]
                    else:
                        v_ = v
                    target.append(v_)
    
    
    def chk_is_coffee(str):
        """Return True when ``str`` contains a coffee-related keyword.

        Latin keywords are compared case-insensitively; Chinese keywords are
        matched as plain substrings.
        """
        # NOTE(review): the original carried two sample rows here (a teppanyaki
        # restaurant tagged as Japanese cuisine, a hotel noodle shop tagged as
        # "other") -- presumably examples the author was checking against.
        latin_markers = ('coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé')
        chinese_markers = ('咖啡', '星巴克')
        if any(marker.upper() in str.upper() for marker in latin_markers):
            return True
        return any(marker in str for marker in chinese_markers)
    
    
    def chk_kfc_mdl(str):
        """Classify a shop name: 1 for McDonald's, 0 for KFC, 2 for anything else.

        The McDonald's keyword is tested first, so a name containing both
        brands yields 1.
        """
        if "麦当劳" in str:
            return 1
        is_kfc = "肯德基" in str or "KFC" in str.upper()
        return 0 if is_kfc else 2
    
    
    def get_name(str):
        """Reduce a shop name to its brand name.

        * Names containing the McDonald's or KFC keywords (``KFC`` is matched
          case-insensitively) are mapped to the canonical Chinese brand name.
        * Otherwise the text before the first opening bracket is returned,
          e.g. ``'Brand(Branch)'`` -> ``'Brand'``.
        * When the name *starts* with a bracketed prefix, e.g.
          ``'(Tag)Brand(Branch)'``, the text between the first ``)`` and the
          following ``(`` is returned.
        * A name without any bracket is returned unchanged.

        Full-width brackets (U+FF08 / U+FF09) are treated like ASCII ones.  The
        original guard tested the same character twice, so only one bracket
        style was ever recognised.

        Returns ``''`` when the name begins with ``(`` and has no closing
        bracket, which is what the original produced for that input.
        """
        if "麦当劳" in str:
            return '麦当劳'
        if "肯德基" in str or "KFC" in str.upper():
            return '肯德基'

        # Normalise full-width brackets so both styles take the same path.
        normalized = str.replace('\uff08', '(').replace('\uff09', ')')
        if '(' not in normalized:
            return str

        head = normalized.strip(' ').split('(')[0].strip(' ')
        if head:
            return head

        # Leading bracketed prefix: use the segment after the first ')'.
        segments = normalized.split(')')
        if len(segments) < 2:
            print('get_name: no closing bracket in %r' % (str,))
            return ''
        return segments[1].split('(')[0]
    
    
    def chk_city_district(str):
        """Validate a city/district field.

        Returns the value with spaces removed, or ``False`` when it contains an
        ASCII letter or digit, or one of the characters ``[``, ``(``, ``)``.
        """
        compact = str.replace(' ', '')
        has_alnum = re.match(r".*[a-zA-Z0-9]", compact) is not None
        has_bracket = any(ch in str for ch in ('[', '(', ')'))
        if has_alnum or has_bracket:
            return False
        return compact
    
    
    def chk_catering_kind(str):
        """Return the cuisine field with spaces removed, or ``False`` if it contains a digit."""
        compact = str.replace(' ', '')
        contains_digit = re.match(r".*[0-9]", compact)
        return False if contains_digit else compact
    
    
    # ['a','',' ']
    def chk_list_thickness(list_):
        """Check that every field in ``list_`` is non-blank.

        Returns ``False`` for an empty list or as soon as a field is empty once
        its spaces are removed; otherwise returns a new list holding each field
        with its spaces removed.
        """
        if len(list_) < 1:
            return False
        compacted = []
        for field in list_:
            squeezed = field.replace(' ', '')
            if not squeezed:
                return False
            compacted.append(squeezed)
        return compacted
    
    
    # Substrings looked for in an address by chk_in_business_area().
    business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']


    def chk_in_business_area(str):
        """Return 1 if ``str`` contains any entry of ``business_area_tag_list``, else 0."""
        matched = any(tag in str for tag in business_area_tag_list)
        return 1 if matched else 0
    
    
    # MAX_OFFSET = 25
    # OFFSET = MAX_OFFSET - 1
    MAX_PAGINATION = 100
    pagination = MAX_PAGINATION
    # The worker function sleeps one second after every QPS API requests.
    QPS = 50
    TIME_UNIT = 1
    # Info-code reference: http://lbs.amap.com/api/webservice/guide/tools/info
    INFOCODE_OK = '10000'
    file_name_key_pool = 'key_pool.pool'
    KEY_POOL_LIST = []
    touse_key = ''
    # Each usable line is tab-separated with the API key in the second column.
    # ``with`` closes the handle, which the original left open.
    with open(file_name_key_pool, 'r', encoding='utf-8') as f:
        for i in f:
            list_ = i.split('\t')
            try:
                key = list_[1].split()
                KEY_POOL_LIST.append(key[0])
            except IndexError as e:
                # Line without a (non-blank) second column: report and skip it.
                print(e, repr(i))
    # Index of the last key in the pool (-1 when the pool is empty).
    KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
    
    # Sample input record (tab-separated city, district, shop name, address):
    # Beijing, Xicheng District, a hotpot restaurant, No. 1 Zhenwumiao 4th Lane.
    # Alternative endpoint: http://restapi.amap.com/v3/place/around?parameters
    # Endpoint that the place-search request URLs are built on.
    URL_TYPE = 'http://restapi.amap.com/v3/place/text'
    # URL_TYPE = 'http://restapi.amap.com/v3/around'
    # API key currently in use; change_key() rebinds it.
    touse_key = ''
    # Query-string fragments concatenated onto the search URL.
    RADIUS = '&radius=20'
    # NOTE(review): the worker function builds its own local ``keywords`` value,
    # so this module-level prefix appears unused in this section -- confirm.
    keywords = '&keywords='
    # NOTE(review): no use of OFFSET is visible in this section -- confirm.
    OFFSET = '&offset=10'
    CITYLIMIT = '&citylimit=true'
    
    # Probe URL template; change_key() fills in a candidate key to test it.
    URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
    
    
    def change_key():
        """Rebind ``touse_key`` to another key from ``KEY_POOL_LIST`` that the API accepts.

        Every key other than the current one is probed with
        ``URL_FOR_CHANGE_KEY``.  The first key whose response carries
        ``infocode == INFOCODE_OK`` becomes ``touse_key`` and the function
        returns.  A probe that fails (network error, timeout, non-JSON body) is
        reported and the next key is tried.

        Raises:
            SystemExit: with message ``'NOInvalidKEY'`` when no other key in the
                pool is accepted.  When raised inside a worker thread this ends
                only that thread.

        Changes from the previous version:

        * the whole pool is scanned (``range(0, len - 1)`` skipped the last key);
        * scanning stops at the first valid key instead of moving on;
        * ``touse_key`` is only rebound after the key has been validated;
        * failures move on to the next key instead of recursing, which could
          fall through to an unbound response variable.
        """
        global touse_key
        current_key = touse_key
        for key in KEY_POOL_LIST:
            if key == current_key:
                continue
            url = URL_FOR_CHANGE_KEY % (key)
            print(62, 'chk_key', url)
            print(62, 'touse_key', key)
            try:
                # The timeout keeps an unresponsive server from hanging the
                # caller forever.
                infocode = requests.get(url, timeout=10).json()['infocode']
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                print(e)
                continue
            if infocode == INFOCODE_OK:
                touse_key = key
                return
        sys.exit('NOInvalidKEY')
    
    
    # Type code 060101: shopping services / mall / shopping centre.
    # POI ``type`` strings are matched against these substrings.
    FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']

    def fliter_gd_business_area_type(url):
        """Return 1 if the place search at ``url`` finds a POI of a filtered type, else 0.

        The response is expected to look like (abridged sample)::

            {"status": "1", "count": "1", "infocode": "10000",
             "pois": [{"typecode": "050118", "type": "<type text>",
                       "distance": {}, "name": "<shop name>"}]}

        Empty fields come back as ``{}`` in that sample, so a POI whose
        ``type`` is not a string is skipped rather than crashing on ``.find``.

        When the request fails or the body is not the expected JSON, the error
        and URL are printed and 0 is returned.  When ``infocode`` is not
        ``INFOCODE_OK``, ``change_key()`` is called to rotate the API key and 0
        is returned for this request.
        """
        try:
            r_json = requests.get(url, timeout=10).json()
            infocode = r_json['infocode']
        except Exception as e:
            # Print the actual error; the class object says nothing useful.
            print(e)
            print(195, url)
            return 0

        if infocode != INFOCODE_OK:
            change_key()
            return 0

        try:
            if int(r_json['count']) <= 0:
                return 0
            pois_list = r_json['pois']
        except (KeyError, TypeError, ValueError) as e:
            print(e)
            print(195, url)
            return 0

        for poi in pois_list:
            poi_type = poi.get('type') if isinstance(poi, dict) else None
            if not isinstance(poi_type, str):
                continue
            if any(tag in poi_type for tag in FILTER_GD_BUSINESS_AREA_TYPE_LIST):
                return 1
        return 0
    
    
    
    # f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
    # f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
    # f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
    
    # Nested result map: city -> district -> brand name -> address ->
    # {catering_kind, average_price, data_from, ...}
    # {city:{district:{name:{address:{catering_kind,average_price,data_from}}}}}
    cater_dic = {}
    # Rows rejected by validation:
    # [{city,district,address,name,catering_kind,average_price,data_from}]
    cater_exception_list = []
    count_catering = 0
    count_catering_exception = 0

    # Rows recognised as coffee shops (kept out of cater_dic).
    coffee_list = []
    count_coffee = 0

    # Read every line of the input.  The previous loop
    #     while fo.readline(): file_line_list.append(fo.readline())
    # consumed one line in the condition and stored only the next one, so
    # every other line was dropped (and '' could be appended at end of file).
    with open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r', encoding='gbk') as fo:
        file_line_list = fo.readlines()
    file_line_list_len = len(file_line_list)
    # Number of lines handed to each worker thread.
    file_jump_step_num = 5000
    
    
    def get_exception_logic_split_loop(nloop):
        """Classify the ``nloop``-th chunk of ``file_line_list``.

        The chunk covers line indexes ``[nloop * file_jump_step_num,
        (nloop + 1) * file_jump_step_num)``, clamped to the list length.  Each
        line is split on ``,`` into city, district, address, name,
        average_price and catering_kind, then routed to one of:

        * ``cater_exception_list`` -- a field is blank or fails validation;
        * ``coffee_list`` -- the brand name or cuisine matches ``chk_is_coffee``;
        * ``cater_dic[city][district][name][address]`` -- everything else,
          with an ``if_in_business_area`` flag.  The flag comes from
          ``chk_in_business_area(address)``; when that yields 0 the place
          search API is asked via ``fliter_gd_business_area_type``.

        Changes from the previous version:

        * the last line of the file is processed (clamping to ``len - 1`` with
          an exclusive range dropped it);
        * rows with fewer than six fields are padded with ``''`` so they land
          in the exception list instead of raising ``IndexError``.

        NOTE(review): the shared counters and containers are updated without a
        lock although ``main`` runs this function in several threads, so
        counter increments can be lost.
        """
        print(247, nloop)
        global count_catering_exception, count_coffee, count_catering
        start_line = nloop * file_jump_step_num
        if start_line >= file_line_list_len:
            print('last-line')
            return

        # range() excludes its end, so clamp to the length itself.
        end_line = min(start_line + file_jump_step_num, file_line_list_len)
        start_line_count = 0  # API requests issued by this worker
        data_from = 'mtdz_5'
        for i in range(start_line, end_line):
            l_ = file_line_list[i].replace('\n', '').split(',')
            # Pad short rows; the blank fields then fail chk_list_thickness.
            l_ += [''] * (6 - len(l_))
            city, district, address, name, average_price, catering_kind = l_[:6]

            focus_list = [city, district, address, name, catering_kind, average_price, data_from]
            dic_exception = {
                'data_from': data_from,
                'city': city,
                'district': district,
                'name': name,
                'address': address,
                'catering_kind': catering_kind,
                'average_price': average_price,
            }

            row_ok = (chk_list_thickness(focus_list) and chk_city_district(city)
                      and chk_city_district(district) and chk_catering_kind(catering_kind))
            if not row_ok:
                count_catering_exception += 1
                cater_exception_list.append(dic_exception)
                continue

            name = get_name(name)
            if chk_is_coffee(name) or chk_is_coffee(catering_kind):
                count_coffee += 1
                coffee_list.append(dic_exception)
                continue

            if_in_business_area = chk_in_business_area(address)
            if_in_business_area_criterion = 'str_match'
            if if_in_business_area == 0:
                # The address text was inconclusive: ask the place search API.
                city_r = '&city=' + district
                keywords = '&keywords=' + address + '|' + name
                start_line_count += 1
                print(417, start_line, start_line_count)
                if start_line_count % QPS == 0:
                    # Pause one second after every QPS requests.
                    print('sleep')
                    sleep(1)
                url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT

                if_in_business_area = fliter_gd_business_area_type(url)
                if if_in_business_area == 1:
                    if_in_business_area_criterion = 'request_api'

            dic_details = {
                'data_from': data_from,
                'catering_kind': catering_kind,
                'average_price': average_price,
                'if_in_business_area_criterion': if_in_business_area_criterion,
                'if_in_business_area': if_in_business_area,
            }
            by_name = cater_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name, {})
            by_name[address] = dic_details
            count_catering += 1
    
    
    
    class MyThread(threading.Thread):
        """Worker thread that runs ``func(args)``; ``args`` is passed as one argument."""

        def __init__(self, func, args, name=''):
            super().__init__()
            self.name = name
            self.func, self.args = func, args

        def run(self):
            """Call the stored function with the stored argument."""
            callback, argument = self.func, self.args
            callback(argument)
    
    def main():
        """Run one worker thread per chunk of input lines, then write ALL.csv.

        Chunk ``n`` is handled by ``get_exception_logic_split_loop(n)``, which
        starts at line ``n * file_jump_step_num``.  After all workers have been
        joined, each ``cater_dic[city][district][name][address]`` entry is
        written as one comma-separated row.

        Changes from the previous version:

        * chunks ``0 .. ceil(lines / step) - 1`` are all processed; ``floor``
          combined with ``range(1, ...)`` skipped both the first chunk and the
          trailing partial chunk;
        * ``t.daemon`` is set; assigning to ``t.setDaemon`` only replaced the
          method;
        * the output file is opened once and closed by a context manager;
        * the header has nine columns to match the nine values in each row.
        """
        print('starting at:', ctime())
        thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
        print(thread_sum)
        threads_list = []
        for nloop in range(thread_sum):
            print(nloop)
            thread_instance = MyThread(get_exception_logic_split_loop, nloop,
                                       get_exception_logic_split_loop.__name__)
            print(353, '123')
            threads_list.append(thread_instance)
        # The main thread exits only after all non-daemon threads have exited.
        for t in threads_list:
            print(t)
            t.daemon = False
            t.start()
        # Wait for all threads to finish.
        for t in threads_list:
            t.join()

        print(467, cater_dic)
        f_name = 'ALL.csv'
        # Columns: city, district, brand name, address, in-mall flag,
        # flag criterion, cuisine kind, average price, data_from.
        header = '市,区,品牌名,地址,是否在商场,if_in_business_area_criterion,菜别(类型),均价,data_from\n'
        detail_keys = ('if_in_business_area', 'if_in_business_area_criterion',
                       'catering_kind', 'average_price', 'data_from')
        count_write_rows = 0
        with open(f_name, 'w', encoding='utf-8-sig') as f:
            f.write(header)
            for city, districts in cater_dic.items():
                if city == '城市':
                    # Header row of the input file, not a data row.
                    continue
                for district, names in districts.items():
                    for name, addresses in names.items():
                        for address, details in addresses.items():
                            row = [city, district, name, address]
                            row.extend(details[key] for key in detail_keys)
                            f.write(','.join(str(value) for value in row) + '\n')
                            count_write_rows += 1
        print(count_write_rows)

    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    谦卑
    自尊和自我效能
    二手时间读书笔记
    vim学习4
    vim学习3
    hdu 5122 K.Bro Sorting
    hdu 5113 Black And White
    poj 2479 Maximum sum
    poj 2392 Space Elevator
    poj 3666 Making the Grade
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7211266.html
Copyright © 2020-2023  润新知