# author : aviviguixiang
# create time : 2018/12/26
import os
import docx
import pandas as pd
from win32com import client as wc
def from_doc_to_docx(doc_path, docx_path):
    """
    Convert the .doc files in ``doc_path`` to .docx files in ``docx_path``.

    The conversion is done by Microsoft Word through COM automation. A file
    is skipped when its .docx counterpart already exists in ``docx_path``.

    :param doc_path: directory containing the source .doc files
    :param docx_path: directory that receives the converted .docx files
                      (created if it does not exist)
    :return: list with the path of every .docx file in ``docx_path``
             (each entry is ``docx_path`` joined with the file name)
    """
    os.makedirs(docx_path, exist_ok=True)
    word = wc.Dispatch('Word.Application')
    try:
        for doc in os.listdir(doc_path):
            stem, ext = os.path.splitext(doc)
            # Only real .doc files are converted; '~$' files are the lock
            # files Word creates next to a document that is currently open.
            if ext.lower() != '.doc' or doc.startswith('~$'):
                continue
            # Word resolves relative paths against its own working directory,
            # so absolute paths are passed to the COM calls.
            target = os.path.abspath(os.path.join(docx_path, stem + '.docx'))
            if os.path.exists(target):
                continue
            doc_data = word.Documents.Open(os.path.abspath(os.path.join(doc_path, doc)))
            try:
                # 16 is WdSaveFormat.wdFormatDocumentDefault (.docx).
                doc_data.SaveAs(target, 16)
                print('完成', doc, end=';\n')
            finally:
                # Close the document even when saving failed.
                doc_data.Close()
    finally:
        # Shut Word down so no background WINWORD process is left running.
        word.Quit()
    return [os.path.join(docx_path, name) for name in os.listdir(docx_path)
            if name.lower().endswith('.docx') and not name.startswith('~$')]
def deal_one_docx(docx_path):
    """
    Extract the bulletin fields from one .docx file.

    Paragraphs whose text is blank are dropped; the remaining paragraphs are
    then addressed by position:

    * index 1  - the space-separated piece containing both '第' and '号'
                 (first and last character removed) and the piece containing
                 '签发' (first three characters removed)
    * index 2  - warning signal
    * index 3  - release content
    * last     - the space-separated piece containing '年' is the release time

    :param docx_path: absolute path of the .docx file
    :return: tuple ``(release_department, release_content, warning_signal,
             release_time, release_leader, others)``
    """
    document = docx.Document(docx_path)
    # Discard empty paragraphs (the original author expected six to remain).
    non_empty = [para for para in document.paragraphs if para.text.strip()]

    # The issuing department is a fixed value; it is not read from the file.
    release_department = '河南省驻马店市气象局'

    footer_parts = non_empty[-1].text.split(' ')
    dated_parts = [part.strip() for part in footer_parts if '年' in part]
    release_time = dated_parts[0]

    release_content = non_empty[3].text.strip()
    warning_signal = non_empty[2].text.strip()

    # The second non-empty paragraph carries both the bulletin number and
    # the signer, so it is split only once.
    header_parts = non_empty[1].text.split(' ')
    numbered_parts = [part.strip() for part in header_parts if '第' in part and '号' in part]
    # Drop the first and last character (presumably the '第' ... '号' wrapper).
    others = numbered_parts[0][1:-1]
    signer_parts = [part.strip() for part in header_parts if '签发' in part]
    # Drop the three-character label that precedes the signer's name.
    release_leader = signer_parts[0][3:]

    return (release_department, release_content, warning_signal,
            release_time, release_leader, others)
def from_doc_to_excel(docx_list, fixed_information, excel_col_names,
                      output_excel_path=r'D:1205jiejiejiejiedocconsequence.xlsx'):
    """
    Parse every .docx file and write one row per file to an Excel workbook.

    Rows are sorted by the bulletin number (the last field of each row,
    converted with ``int``) before being written.

    :param docx_list: absolute paths of the .docx files to process
    :param fixed_information: the values that are identical for every row, in
        the order ``(start_reason, release_channels, service_objects,
        telephone_condition, disaster_information, telephone_number)``
    :param excel_col_names: header row of the workbook; must have one name
        for each of the 12 fields in a row
    :param output_excel_path: absolute path of the workbook to write
    :return: None
    """
    # NOTE(review): the default path looks like a Windows path whose
    # separators are missing - confirm the intended location.
    (start_reason, release_channels, service_objects, telephone_condition,
     disaster_information, telephone_number) = fixed_information

    # dirname is '' when the path has no directory part; makedirs('') raises.
    output_dir = os.path.dirname(output_excel_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    all_consequence = []
    for docx_path in docx_list:
        print(os.path.basename(docx_path))  # progress: file name being processed
        (release_department, release_content, warning_signal, release_time,
         release_leader, others) = deal_one_docx(docx_path)
        all_consequence.append([
            release_department, release_content, warning_signal, start_reason,
            release_time, release_leader, telephone_number, release_channels,
            service_objects, telephone_condition, disaster_information, others,
        ])

    # Order the rows by bulletin number, which is stored as the last field.
    all_consequence.sort(key=lambda row: int(row[-1]))

    data = pd.DataFrame(all_consequence, columns=excel_col_names)
    data.to_excel(output_excel_path, index=False)
if __name__ == '__main__':
doc_path = r'D:1205jiejiejiejiedocdoc' # word所在的目录
docx_path = r'D:1205jiejiejiejiedocdocx' # 存docx的目录
# ------------------------------固定信息-------------------------------------------------------
telephone_number = '0396-2668798'
release_channels =