• Is this its limit?


    import sys
    import os
    
    # Absolute path of the directory that contains this script.
    curPath = os.path.abspath(os.path.dirname(__file__))
    # Parent of that directory; it is appended to the module search path.
    rootPath = os.path.split(curPath)[0]
    sys.path.append(rootPath)
    
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    # from selenium.webdriver.firefox.options import Options
    import time
    from time import sleep
    import math
    import random
    import sys
    import threading
    import urllib.parse
    import xlrd
    import sys
    import os
    import sqlite3
    
    # Threshold that chk_time() compares against the elapsed time.time() delta
    # (i.e. seconds).
    MAX_TIME = 600
    
    
    def py_stop_update_db():
        """Placeholder hook called by chk_time(); currently a no-op.

        The body used to open the SQLite file 'py_bdspider_status.db' next to
        this script and run an UPDATE on ``pystatus_table`` setting
        ``pystatus`` to 2 for the row whose ``pyname`` is this script's base
        name. That logic is disabled, so nothing is read or written.

        Returns:
            None.
        """
        return None
    
    
    def chk_time(browser, start_time):
        """Shut the browser down once more than MAX_TIME has elapsed.

        Args:
            browser: object exposing ``delete_all_cookies()`` and ``quit()``
                (a Selenium WebDriver in this script).
            start_time: ``time.time()`` value to measure the elapsed time from.

        Returns:
            None in every case; callers are not told whether the browser
            was closed.
        """
        elapsed = time.time() - start_time
        if not elapsed > MAX_TIME:
            return
        # Time budget exceeded: run the status hook, then tear the browser down.
        py_stop_update_db()
        browser.delete_all_cookies()
        browser.quit()
    
    
    # Name of the directory (next to this script) that holds the saved pages.
    dir_html = 'baidu_map_html_firstpage_pc_not_shop'
    # os.path.join avoids the invalid '\%' escape sequence and the hard-coded
    # Windows separator of a hand-written path template.
    filepath = os.path.join(curPath, dir_html)
    pathDir = os.listdir(filepath)
    # Query strings that already have a saved page: the file name with anything
    # from the first '&' and the '.html' suffix removed. The name is taken from
    # the directory entry itself rather than by splitting a glued-together path
    # on the directory name.
    requested_file_list = [
        saved_name.split('&')[0].split('.html')[0] for saved_name in pathDir
    ]
    
    # Separator strings; extract_name() keeps only the text in front of each.
    # NOTE(review): the first two entries are identical -- possibly one of them
    # was meant to be a different (e.g. full-width) parenthesis; confirm.
    tag_jmtool_list = ['(', '(', '-']


    def extract_name(name_):
        """Return ``name_`` cut off at the first occurrence of any separator.

        The separators in ``tag_jmtool_list`` are applied one after another,
        each time keeping only the part before the separator.

        Args:
            name_: the string to shorten.

        Returns:
            The leading part of ``name_``; the whole string when no separator
            occurs in it.
        """
        shortened = name_
        for separator in tag_jmtool_list:
            shortened, _, _ = shortened.partition(separator)
        return shortened
    
    
    # Build the city list from the text file next to this script: every third
    # line is used, with spaces and the trailing newline removed and the
    # suffix '市' appended.
    pcity_list = []
    pcity_file = os.path.join(curPath, '省会城市.txt')
    with open(pcity_file, 'r', encoding='utf-8') as pf:
        for line_no, line in enumerate(pf, start=1):
            if line_no % 3 == 0:
                # '\n' must be written as an escape; a literal line break
                # inside the quotes is a syntax error.
                pcity_list.append(line.replace(' ', '').replace('\n', '') + '市')
    pcity_sorted_list = sorted(pcity_list)
    
    # Area type(s) to crawl. Only this value is in effect; selections used
    # before were the residential-community/office-building pair and the
    # specialist-hospital type.
    target_type_list = ['商场']
    # Figures noted by the author (presumably record counts -- confirm):
    # shopping mall 4705, hotel 24915, specialist hospital 2513,
    # business district 334.
    # Counter bumped by thread_city() for every query that already has a page.
    requested_type_counter = 0
    # Nested mapping filled from the spreadsheet:
    # city -> district -> type -> reduced name -> lists.
    target_dic = {}
    # Cities to crawl in this run: entries 21..27 of the sorted city list.
    # Selections used before: various subsets of Beijing / Shanghai /
    # Shenzhen / Guangzhou, pcity_sorted_list[7:14], and the full
    # pcity_sorted_list.
    target_city_list = pcity_sorted_list[21:28]
    
    # target_city_list = ['杭州市']
    file_name = 'JMTool任务_csv_py_wholeCSV'

    # os.path.join avoids the invalid '\%' escape sequence and the hard-coded
    # Windows separator.
    FEXCEL = os.path.join(curPath, file_name + '.xlsx')
    # NOTE(review): xlrd 2.0 and later only read .xls workbooks; opening this
    # .xlsx file needs xlrd < 2.0 -- confirm the pinned version.
    data = xlrd.open_workbook(FEXCEL)
    table = data.sheets()[0]
    nrows, ncols = table.nrows, table.ncols
    res_dic, counter_ = {}, 0
    # Group the rows of the first sheet as
    # target_dic[city][district][type][reduced name] ->
    #     {'name_reduction_list': [full names], 'history_list': [raw rows]}
    for i in range(nrows):
        row = table.row_values(i)
        # Every row is expected to have exactly these ten columns.
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, emp_, emp_1) = row
        if city not in target_city_list:
            continue
        type_ = ref_area_type_code
        if type_ not in target_type_list:
            continue
        name_ = name_.replace('?', '')
        name_reduction = extract_name(name_)
        # Fall back to the full name when the reduced one is too short.
        if len(name_reduction) < 3:
            name_reduction = name_
        type_bucket = (target_dic.setdefault(city, {})
                       .setdefault(district, {})
                       .setdefault(type_, {}))
        # Look the reduced name up in the per-type bucket. Testing it against
        # the per-district dict (whose keys are types) would recreate the entry
        # on every row and drop the names and rows collected before.
        entry = type_bucket.setdefault(
            name_reduction, {'name_reduction_list': [], 'history_list': []})
        entry['name_reduction_list'].append(name_)
        entry['history_list'].append(row)
    
    # Output directory for saved pages. The empty last component makes
    # os.path.join end the path with a separator, which write_res_html() relies
    # on because it appends the file name by plain string concatenation.
    # (A hand-written template ending in a backslash before the closing quote
    # escapes that quote and does not parse.)
    write_res_html_dir = os.path.join(curPath, dir_html, '')
    
    
    def write_res_html(browser, dir_=write_res_html_dir):
        """Save the browser's current page as '<dir_><query>.html'.

        The query is the part of the URL-decoded current URL between '&wd='
        and the next '/?'. The file content is the raw current URL wrapped in
        an HTML comment, followed by the page source.

        Args:
            browser: object exposing ``current_url`` and ``page_source``
                (a Selenium WebDriver in this script).
            dir_: directory prefix, expected to end with a path separator
                because the file name is appended by string concatenation.

        Returns:
            None. When the URL contains no '&wd=' part a message is printed
            and nothing is written.
        """
        close_alert(browser)
        raw_url = browser.current_url
        decoded_url = urllib.parse.unquote(raw_url)
        try:
            input_ = decoded_url.split('&wd=')[1].split('/?')[0]
        except IndexError:
            # No '&wd=' marker: this is not a search result URL.
            print('Exception-', __file__, sys._getframe().f_lineno, decoded_url)
            return
        page_source = '%s%s%s%s' % ('<!--', raw_url, '-->', browser.page_source)
        file_name = '%s%s%s' % (dir_, input_, '.html')
        # The context manager closes the file; merely reading ``fo.closed``
        # would leave it open.
        with open(file_name, 'w', encoding='utf-8') as fo:
            fo.write(page_source)
        print(os.path.basename(__file__), 'OK-writed-', sys._getframe().f_lineno, '')
    
    
    def gen_random_letter():
        """Return one random character with a code point from 97 to 122 ('a'-'z')."""
        code_point = random.randint(97, 122)
        return chr(code_point)
    
    
    def gen_random_num():
        """Return a random integer from 0 to 10, both ends included."""
        lowest, highest = 0, 10
        return random.randint(lowest, highest)
    
    
    def gen_sougo_pid():
        """Return a random 16-character id made of lowercase letters and digits.

        Positions 1, 3, 4 and 15 (1-based) hold a letter 'a'-'z'; every other
        position holds a single decimal digit. Digits are drawn from 0-9 so
        each position contributes exactly one character and the result always
        has length 16 (drawing from an inclusive 0-10 range could insert the
        two-character value '10').

        Returns:
            str: the generated id.
        """
        letter_positions = (1, 3, 4, 15)
        chars = []
        for position in range(1, 17):
            if position in letter_positions:
                chars.append(chr(random.randint(97, 122)))
            else:
                chars.append(str(random.randint(0, 9)))
        return ''.join(chars)
    
    
    def close_alert(browser, attitude='accept'):
        """Stub for alert handling; both arguments are ignored.

        Args:
            browser: unused.
            attitude: unused; defaults to 'accept'.

        Returns:
            None.
        """
        return None
    
    
    # executable_path_str = '%s\%s' % (curPath, 'geckodriver.exe')
    # browser = webdriver.Firefox(executable_path=executable_path_str)
    
    
    
    def mobile_mobile_pages_html(browser, input_):
        """Open the Baidu Map search URL for ``input_`` and save the page.

        Args:
            browser: object exposing ``get()`` plus whatever chk_time() and
                write_res_html() use (a Selenium WebDriver in this script).
            input_: query text appended verbatim to the search URL.

        Returns:
            None.
        """
        # NOTE(review): the timer is started right before it is checked, so
        # the elapsed time seen by chk_time() here is practically zero.
        chk_time(browser, time.time())
        # Fixed three-second pause before each request.
        sleep(3)
        search_prefix = 'http://map.baidu.com/?s=s%26wd%3D'
        url_ = search_prefix + str(input_)
        browser.get(url_)
        write_res_html(browser)
    
    
    class MyThread(threading.Thread):
        """Thread that calls ``func(args)``.

        ``args`` is handed to ``func`` as ONE positional argument; it is not
        unpacked.
        """

        def __init__(self, func, args, name):
            """Store the callable, its single argument and the thread name."""
            super().__init__()
            self.func = func
            self.args = args
            self.name = name

        def run(self):
            """Invoke the stored callable with the stored argument."""
            target, payload = self.func, self.args
            target(payload)
    
    
    def thread_city(city):
        """Fetch and save the search page of every collected name of one city.

        For each full name stored under ``target_dic[city]`` the query
        '<city><district><name>' is built. Queries that already have a saved
        page only bump the global ``requested_type_counter``; every other
        query is loaded in a fresh Firefox instance that is quit afterwards.

        Args:
            city: key of ``target_dic`` to process.

        Returns:
            None.
        """
        global requested_type_counter
        # Snapshot as a set: membership is tested once per name.
        already_saved = set(requested_file_list)
        # os.path.join avoids the invalid '\%' escape sequence and the
        # hard-coded Windows separator.
        executable_path_str = os.path.join(curPath, 'geckodriver.exe')
        for district, types in target_dic[city].items():
            for names in types.values():
                for entry in names.values():
                    for name_ in entry['name_reduction_list']:
                        input_ = '%s%s%s' % (city, district, name_)
                        if input_ in already_saved:
                            # NOTE(review): the counter is shared by all city
                            # threads and updated without a lock; it is only
                            # printed as progress information.
                            requested_type_counter += 1
                            print('requested_type_counter=', requested_type_counter, input_)
                            continue
                        browser = webdriver.Firefox(executable_path=executable_path_str)
                        try:
                            mobile_mobile_pages_html(browser, input_)
                        finally:
                            # One browser is started per query; quit it so the
                            # processes do not pile up.
                            browser.quit()
    
    
    # Run one worker thread per city found in the spreadsheet and wait for all
    # of them. MyThread passes its ``args`` value to the function as a single
    # argument, so the city name is handed over as-is.
    threads_list = [
        MyThread(thread_city, city, thread_city.__name__) for city in target_dic
    ]
    for t in threads_list:
        # ``daemon`` is the attribute to set; assigning to ``setDaemon`` would
        # only replace that method with a bool and leave the flag untouched.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()
    
    # browser.delete_all_cookies()
    # browser.quit()
    

      

  • 相关阅读:
    csharp: Cyotek.GhostScript.PdfConversion pdf convert image
    csharp: using Acrobat.dll pdf convert images in winform
    机器学习实战---K均值聚类算法
    机器学习实战---决策树CART回归树实现
    机器学习实战---决策树CART简介及分类树实现
    机器学习实战---线性回归(更好的使用正规方程求解)
    机器学习实战---逻辑回归梯度上升(更好的理解sigmoid函数的含义并改进)
    机器学习实战---朴素贝叶斯算法使用K折交叉验证
    机器学习实战---朴素贝叶斯算法
    机器学习实战---决策树ID3算法
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7440608.html
Copyright © 2020-2023  润新知