API: http://python-docx.readthedocs.io/en/latest/#api-documentation
1.将doc转为docx
python3.8 中使用 win32com 需要先安装 pypiwin32:pip install pypiwin32
from win32com import client as wc
word = wc.Dispatch("Word.Application")
doc = word.Documents.Open(路径+名称.doc)
doc.SaveAs(路径+名称.docx, 12)  # 12 即 WdSaveFormat 中的 wdFormatXMLDocument,表示保存为 docx 格式
doc.Close()
word.Quit()
2.读取段落
import docx
docStr = docx.Document(docName)  # 打开文档
for paragraph in docStr.paragraphs:
parStr = paragraph.text
--》paragraph.style.name == 'Heading 1' 一级标题
--》paragraph.paragraph_format.alignment == 1 居中显示
--》paragraph.style.next_paragraph_style.paragraph_format.alignment == 1 下一段居中显示
--》paragraph.style.font.color
3.读取表格
numTables = docStr.tables
for table in numTables:
#行列个数
row_count = len(table.rows)
col_count = len(table.columns)
for i in range(row_count):
row = table.rows[i].cells
i行j列内容:row[j].text
或者:
row_count = len(table.rows)
col_count = len(table.columns)
for i in range(row_count):
for j in range(col_count):
print(table.cell(i,j).text)
4.按样式读取
读取标题
for p in doc.paragraphs:
if p.style.name=='Heading 1':
print(p.text)
import re
for p in doc.paragraphs:
if re.match(r"^Heading \d+$", p.style.name):
print(p.text)
读取正文
for p in doc.paragraphs:
if p.style.name=='Normal':
print(p.text)
5.获取docx支持的样式
from docx.enum.style import WD_STYLE_TYPE
s = doc.styles  # 文档中的全部样式
for i in s:
if i.type==WD_STYLE_TYPE.PARAGRAPH:
print(i.name)
6.设置首行缩进
from docx.shared import Inches,Pt
par2 = doc.add_paragraph('段落文本')
# 左缩进,0.5 英寸
par2.paragraph_format.left_indent = Inches(0.5)
# 右缩进,20 磅
par2.paragraph_format.right_indent = Pt(20)
# 首行缩进
par2.paragraph_format.first_line_indent = Inches(1)
查看首行缩进单位
from docx import Document
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.shared import Cm, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import RGBColor
myDocument = Document('2020年建交集团3月分析报告.docx')
for paragraph in myDocument.paragraphs:
print(paragraph.paragraph_format.first_line_indent)
print(dir(paragraph))
https://blog.csdn.net/weixin_45903952/article/details/106200213
https://blog.csdn.net/zhouz92/article/details/107028727?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2allfirst_rank_v2~rank_v25-18-107028727.nonecase&utm_term=python%20%E6%AE%B5%E8%90%BD%E8%AE%BE%E7%BD%AE&spm=1000.2123.3001.4430
https://blog.csdn.net/xtfge0915/article/details/83479922