• 算法调参 weight_ratio, weight_seqratio


     

    from openpyxl import Workbook
    import xlrd
    import time
    import Levenshtein as Le
    
    # Rows whose city is not in this list are skipped by main_().
    target_city_list = ['北京市', '上海市', '深圳市', '广州市']
    # Base name of the input workbook; main_() also derives output names from it.
    source_name = 'JMTool任务_csv_py_wholeCSV-加百度170826165729'
    # Separators inside the BDpoi_list cell: the first splits one candidate from
    # the next, the second splits a candidate's name from its address.
    BDpoi_list_tag = '|-|'
    BDpoi_list_tagb = '|--|'
    FEXCEL = source_name + '.xlsx'

    # Weights of Levenshtein ratio() and seqratio() in the combined score.
    weight_ratio = 0.7
    weight_seqratio = 0.3
    
    
    def main_():
        """Score the Baidu POI candidates of every source row and export two workbooks.

        Reads the first sheet of ``FEXCEL``. Each row must hold exactly nine
        cells (dbid, area_code, ref_area_type_code, city, district, address,
        city_street, name_, BDpoi_list); the header row (first cell ``'dbid'``)
        and rows whose city is not in ``target_city_list`` are skipped.

        For every candidate found in ``BDpoi_list`` the combined score
        ``weight_ratio * Le.ratio(...) + weight_seqratio * Le.seqratio(...)`` is
        computed. Result rows are grouped by city / district / name_ and, per
        source row, ordered by ascending score.

        Two ``.xlsx`` files are written to the working directory:

        * ``<source_name>-Levenshtein<timestamp>.xlsx`` with every result row;
        * a second file with only the last result row of each
          city / district / name_ group (the highest-scoring candidate of the
          most recently processed source row with that name).

        Raises:
            ValueError: if a sheet row does not contain exactly nine cells.
        """
        # NOTE(review): xlrd 2.0+ only reads legacy .xls files; opening this
        # .xlsx input requires xlrd < 2.0 -- confirm the installed version.
        data = xlrd.open_workbook(FEXCEL)
        table = data.sheets()[0]

        # city -> district -> name_ -> list of result rows (tuples)
        res_dic = {}
        for row_idx in range(table.nrows):
            (dbid, area_code, ref_area_type_code, city, district, address,
             city_street, name_, BDpoi_list) = table.row_values(row_idx)
            if dbid == 'dbid':
                continue
            if city not in target_city_list:
                continue

            group_rows = res_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name_, [])
            base_row = (dbid, area_code, ref_area_type_code, city, district,
                        address, city_street, name_, BDpoi_list)

            if BDpoi_list_tag not in BDpoi_list:
                # No candidate list at all: keep the source row with blank match cells.
                group_rows.append(base_row + ('', '', ''))
                continue

            addr_ = '%s%s%s%s' % (city, district, address, city_street)
            chk_name_list = [name_, addr_]
            scored_rows = []
            for candidate in BDpoi_list.split(BDpoi_list_tag):
                if not candidate:
                    continue
                parts = candidate.split(BDpoi_list_tagb)
                BD_name, BD_addr = parts[0], ''
                if len(parts) == 2:
                    # Drop the city and district text from the Baidu address.
                    BD_addr = parts[1].replace(city, '').replace(district, '')
                ratio_res = Le.ratio(name_, BD_name)
                seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
                ratio_seqratio_res = weight_ratio * ratio_res + weight_seqratio * seqratio_res
                scored_rows.append(
                    base_row + (BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res))

            # Stable ascending sort by combined score: the best candidate ends up
            # last and equal scores keep their original order.
            scored_rows.sort(key=lambda result_row: result_row[-3])
            group_rows.extend(scored_rows)

        # Workbook 1: every result row.
        wb = Workbook()
        worksheet = wb.active
        file_title_str = ' dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, BDpoi_list, BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res'
        worksheet.append(file_title_str.replace(' ', '').split(','))
        for districts in res_dic.values():
            for names in districts.values():
                for group_rows in names.values():
                    for result_row in group_rows:
                        worksheet.append(result_row)
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
        wb.save('%s%s' % (file_name, '.xlsx'))

        # Workbook 2: only the last (best-scoring) row of each group.
        wb = Workbook()
        worksheet = wb.active
        file_title_str = ' dbid, area_code, ref_area_type_code, city, district, address, city_street, name_,BDpoi_list,max_BD_name, max_BD_addr, max_ratio_seqratio_res, ratio_res, seqratio_res'
        worksheet.append(file_title_str.replace(' ', '').split(','))
        for districts in res_dic.values():
            for names in districts.values():
                for group_rows in names.values():
                    # A group stays empty when its BDpoi_list held only separators;
                    # indexing [-1] on it used to raise IndexError.
                    if not group_rows:
                        continue
                    worksheet.append(group_rows[-1])
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        # The name builds on the first output's name, so '-Levenshtein' and a
        # timestamp appear twice; kept as-is so existing file naming is unchanged.
        file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
        wb.save('%s%s' % (file_name, '.xlsx'))
    
    
    # Run the scoring and export immediately when this script is executed.
    main_()
    
    
     
    

      

    from openpyxl import Workbook
    import xlrd
    import time
    import Levenshtein as Le
    
    # Rows whose city is not in this list are skipped by main_().
    target_city_list = ['深圳市']
    # Separators inside the BDpoi_list cell: the first splits one candidate from
    # the next, the second splits a candidate's name from its address.
    BDpoi_list_tag = '|-|'
    BDpoi_list_tagb = '|--|'

    # Base name of the input workbook; main_() also derives output names from it.
    source_name = 'JMTool任务_csv_py_wholeCSV_住宅小区-加百度170826152533'
    FEXCEL = source_name + '.xlsx'
    # Weights of Levenshtein ratio() and seqratio() in the combined score.
    weight_ratio = 0.7
    weight_seqratio = 0.3
    
    
    def main_():
        """Score the Baidu POI candidates of every source row and export two workbooks.

        Reads the first sheet of ``FEXCEL``. Each row must hold exactly ten
        cells (dbid, area_code, ref_area_type_code, city, district, address,
        city_street, name_, name_reduction, BDpoi_list); the header row (first
        cell ``'dbid'``) and rows whose city is not in ``target_city_list`` are
        skipped.

        For every candidate found in ``BDpoi_list`` the combined score
        ``weight_ratio * Le.ratio(...) + weight_seqratio * Le.seqratio(...)`` is
        computed, comparing against ``name_reduction`` rather than ``name_``.
        Result rows are grouped by city / district / name_ and, per source row,
        ordered by ascending score.

        Two ``.xlsx`` files are written to the working directory:

        * ``<source_name>-Levenshtein<timestamp>.xlsx`` with every result row;
        * a second file with only the last result row of each
          city / district / name_ group (the highest-scoring candidate of the
          most recently processed source row with that name).

        Raises:
            ValueError: if a sheet row does not contain exactly ten cells.
        """
        # NOTE(review): xlrd 2.0+ only reads legacy .xls files; opening this
        # .xlsx input requires xlrd < 2.0 -- confirm the installed version.
        data = xlrd.open_workbook(FEXCEL)
        table = data.sheets()[0]

        # city -> district -> name_ -> list of result rows (tuples)
        res_dic = {}
        for row_idx in range(table.nrows):
            (dbid, area_code, ref_area_type_code, city, district, address,
             city_street, name_, name_reduction, BDpoi_list) = table.row_values(row_idx)
            if dbid == 'dbid':
                continue
            if city not in target_city_list:
                continue

            group_rows = res_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name_, [])
            base_row = (dbid, area_code, ref_area_type_code, city, district,
                        address, city_street, name_, name_reduction, BDpoi_list)

            if BDpoi_list_tag not in BDpoi_list:
                # No candidate list at all: keep the source row with blank match cells.
                group_rows.append(base_row + ('', '', ''))
                continue

            addr_ = '%s%s%s%s' % (city, district, address, city_street)
            chk_name_list = [name_reduction, addr_]
            scored_rows = []
            for candidate in BDpoi_list.split(BDpoi_list_tag):
                if not candidate:
                    continue
                parts = candidate.split(BDpoi_list_tagb)
                BD_name, BD_addr = parts[0], ''
                if len(parts) == 2:
                    # Drop the city and district text from the Baidu address.
                    BD_addr = parts[1].replace(city, '').replace(district, '')
                ratio_res = Le.ratio(name_reduction, BD_name)
                seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
                ratio_seqratio_res = weight_ratio * ratio_res + weight_seqratio * seqratio_res
                scored_rows.append(
                    base_row + (BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res))

            # Stable ascending sort by combined score: the best candidate ends up
            # last and equal scores keep their original order.
            scored_rows.sort(key=lambda result_row: result_row[-3])
            group_rows.extend(scored_rows)

        # Workbook 1: every result row.
        wb = Workbook()
        worksheet = wb.active
        file_title_str = 'dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, name_reduction, BDpoi_list, BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res'
        worksheet.append(file_title_str.replace(' ', '').split(','))
        for districts in res_dic.values():
            for names in districts.values():
                for group_rows in names.values():
                    for result_row in group_rows:
                        worksheet.append(result_row)
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
        wb.save('%s%s' % (file_name, '.xlsx'))

        # Workbook 2: only the last (best-scoring) row of each group.
        wb = Workbook()
        worksheet = wb.active
        file_title_str = 'dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, name_reduction,BDpoi_list,max_BD_name, max_BD_addr, max_ratio_seqratio_res, ratio_res, seqratio_res'
        worksheet.append(file_title_str.replace(' ', '').split(','))
        for districts in res_dic.values():
            for names in districts.values():
                for group_rows in names.values():
                    # A group stays empty when its BDpoi_list held only separators;
                    # indexing [-1] on it used to raise IndexError.
                    if not group_rows:
                        continue
                    worksheet.append(group_rows[-1])
        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
        # The name builds on the first output's name, so '-Levenshtein' and a
        # timestamp appear twice; kept as-is so existing file naming is unchanged.
        file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
        wb.save('%s%s' % (file_name, '.xlsx'))
    
    
    # Run the scoring and export immediately when this script is executed.
    main_()
    

      

  • 相关阅读:
    .html(),.text()和.val()的差异总结:
    获取或设置checkbox radio select的值
    sublime 搜索时忽略文件夹
    转载------一小时包教会 —— webpack 入门指南
    转载--git教程
    转载--网站数据统计分析中的日志收集原理及其实现
    devexpress 安装及破解
    基于socket的客户端和服务端聊天简单使用 附Demo
    Ajax技术原理小结
    oracle 资源学习汇总
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7435841.html
Copyright © 2020-2023  润新知