• word->excel


    # author : aviviguixiang
    # create time : 2018/12/26

    import os
    import docx
    import pandas as pd
    from win32com import client as wc



    def from_doc_to_docx(doc_path, docx_path):
        """
        Convert every legacy ``.doc`` file in ``doc_path`` to ``.docx`` using Word.

        A file whose ``.docx`` counterpart already exists in ``docx_path`` is
        skipped, so the function can be re-run safely. Word is only started when
        at least one file needs converting, and it is always shut down again,
        even if a conversion fails.

        :param doc_path: directory containing the source ``.doc`` files
        :param docx_path: directory receiving the ``.docx`` files (created if missing)
        :return: sorted list of the ``.docx`` files found in ``docx_path``, each
                 joined with ``docx_path`` (absolute when ``docx_path`` is absolute)
        """
        os.makedirs(docx_path, exist_ok=True)

        # Collect (source name, target path) pairs that still need converting.
        pending = []
        for name in sorted(os.listdir(doc_path)):
            stem, ext = os.path.splitext(name)
            # Only real .doc files; "~$..." entries are Word's lock files.
            if ext.lower() != '.doc' or name.startswith('~$'):
                continue
            target = os.path.join(docx_path, stem + '.docx')
            if not os.path.exists(target):
                pending.append((name, target))

        if pending:
            w = wc.Dispatch('Word.Application')
            try:
                for name, target in pending:
                    # Word resolves relative paths against its own working
                    # directory, so hand it absolute paths.
                    doc_data = w.Documents.Open(os.path.abspath(os.path.join(doc_path, name)))
                    try:
                        # 16 is Word's wdFormatDocumentDefault (.docx) save format.
                        doc_data.SaveAs(os.path.abspath(target), 16)
                        print('完成', name, end='; ')
                    finally:
                        doc_data.Close()
            finally:
                # Without this the automated Word process stays alive.
                w.Quit()

        # Report only readable .docx files; skip lock files and unrelated entries.
        return [
            os.path.join(docx_path, name)
            for name in sorted(os.listdir(docx_path))
            if name.lower().endswith('.docx') and not name.startswith('~$')
        ]


    def deal_one_docx(docx_path):
        """
        Pull the bulletin fields out of a single ``.docx`` file.

        Empty paragraphs are dropped first. The fields are then read by position:
        the last paragraph supplies the release time, index 3 the content,
        index 2 the warning signal, and index 1 both the bulletin number and the
        signing leader.

        :param docx_path: path of the ``.docx`` file to read
        :return: tuple ``(release_department, release_content, warning_signal,
                 release_time, release_leader, others)``
        """

        def first_token(text, *markers):
            # First space-separated token containing every marker, stripped.
            hits = [tok.strip() for tok in text.split(' ') if all(m in tok for m in markers)]
            return hits[0]

        document = docx.Document(docx_path)

        # Keep only paragraphs that contain visible text.
        filled = []
        for para in document.paragraphs:
            if para.text.strip():
                filled.append(para)

        # The department is a fixed value; it is not parsed from the document.
        release_department = '河南省驻马店市气象局'
        release_time = first_token(filled[-1].text, '年')
        release_content = filled[3].text.strip()
        warning_signal = filled[2].text.strip()

        header_text = filled[1].text
        # Drop the first and last character of the "第…号" token.
        others = first_token(header_text, '第', '号')[1:-1]
        # Drop the first three characters of the "签发" token.
        release_leader = first_token(header_text, '签发')[3:]

        return (
            release_department,
            release_content,
            warning_signal,
            release_time,
            release_leader,
            others,
        )


    def from_doc_to_excel(docx_list, fixed_information, excel_col_names,
                          output_excel_path=r'D:1205jiejiejiejiedocconsequence.xlsx'):
        """
        Parse every ``.docx`` bulletin and write one row per file to an Excel sheet.

        Each row combines the fields returned by ``deal_one_docx`` with the fixed
        values from ``fixed_information``. Rows are sorted by the integer value
        of their last field before being written.

        :param docx_list: paths of the ``.docx`` files to process
        :param fixed_information: six values that are identical on every row, in
               the order ``start_reason, release_channels, service_objects,
               telephone_condition, disaster_information, telephone_number``
        :param excel_col_names: header row of the sheet (one name per row field)
        :param output_excel_path: path of the Excel file to write
        :return: None
        """
        # NOTE(review): the default output path contains no separators after the
        # drive letter; it looks like backslashes were lost - confirm the value.
        (start_reason, release_channels, service_objects, telephone_condition,
         disaster_information, telephone_number) = fixed_information

        # A bare file name has an empty dirname, and os.makedirs('') raises,
        # so only create the directory when there is one to create.
        output_dir = os.path.dirname(output_excel_path)
        if output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir, exist_ok=True)

        all_consequence = []
        for docx_path in docx_list:
            print(os.path.basename(docx_path))  # progress: file name only
            (release_department, release_content, warning_signal, release_time,
             release_leader, others) = deal_one_docx(docx_path)
            all_consequence.append([
                release_department, release_content, warning_signal, start_reason,
                release_time, release_leader, telephone_number, release_channels,
                service_objects, telephone_condition, disaster_information, others,
            ])

        # The last field is the bulletin number; int() raises ValueError if it
        # is not purely numeric.
        all_consequence.sort(key=lambda row: int(row[-1]))

        # Build the frame straight from the rows instead of filling it cell by cell.
        data = pd.DataFrame(all_consequence, columns=excel_col_names)
        data.to_excel(output_excel_path, index=False)



    if __name__ == '__main__':
        # NOTE(review): both paths have no separators after the drive letter; it
        # looks like backslashes were lost - confirm before running.
        doc_path = r'D:1205jiejiejiejiedocdoc'  # directory holding the Word .doc files
        docx_path = r'D:1205jiejiejiejiedocdocx'  # directory receiving the .docx files

        # ---- fixed information: identical on every row of the sheet ----
        start_reason = telephone_condition = disaster_information = ''
        release_channels = '手机短信、国突平台、12121'
        service_objects = '市党政领导、各局委、相关单位、媒体记者等'
        telephone_number = '0396-2668798'
        FIXED_INFORMATION = [
            start_reason,
            release_channels,
            service_objects,
            telephone_condition,
            disaster_information,
            telephone_number,
        ]
        excel_col_names = [
            '发布单位', '发布内容', '预警级别', '启动原因',
            '发布时间', '签发领导', '联系电话', '发布手段',
            '服务对象', '电话报告情况', '灾情信息', '其他',
        ]
        # ---- end of fixed information ----

        # Convert the .doc files first, then export the parsed fields to Excel.
        docx_list = from_doc_to_docx(doc_path, docx_path)
        from_doc_to_excel(
            docx_list=docx_list,
            fixed_information=FIXED_INFORMATION,
            excel_col_names=excel_col_names,
        )
  • 相关阅读:
    主线程——main线程
    进程和线程概念及原理
    抓取网贷之家的数据爬虫
    感知哈希算法的java实现
    最短路径—Dijkstra算法和Floyd算法
    关于图像特征提取
    hive学习之WordCount单词统计
    pig、hive以及hbase的作用
    zookeeper入门知识
    hadoop文件系统浅析
  • 原文地址:https://www.cnblogs.com/avivi/p/11354250.html
Copyright © 2020-2023  润新知