• python docx通过关键字标注字体以及颜色大小等


    主要使用python-docx 与pandas

    因为python-docx对表格的解析不够友好且效率低,故需先将表格内容转换为pandas的DataFrame再进行处理

    代码如下

    # coding:utf-8
    import os, re
    import docx
    from docx.document import Document as dc
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import _Cell, Table
    from docx.text.paragraph import Paragraph
    from docx.shared import RGBColor  # 设置字体颜色
    from docx import Document
    from docx.shared import Pt  # 设置字体
    from docx.oxml.ns import qn  # 设置中文字体
    import pandas as pd
    
    # Path of the .docx file to annotate.
    # NOTE(review): the value looks like a redacted placeholder -- replace it with a real path.
    FILE_PATH = r"D:xxxxxxxxxxxxxxxx.docx"
    
    # Document object shared by the script below; it is opened as soon as this line runs.
    obj = docx.Document(FILE_PATH)
    
    
    def iter_block_items(parent):
        """Yield the paragraphs and tables directly under *parent*, in order.

        :param parent: a document object (``dc``) or a table cell (``_Cell``)
        :return: generator of ``Paragraph`` and ``Table`` wrappers, one per
            matching child element; other child elements are skipped
        :raises ValueError: when *parent* is neither of the supported types
            (raised on first iteration, because this is a generator)
        """
        if isinstance(parent, _Cell):
            container = parent._tc
        elif isinstance(parent, dc):
            container = parent.element.body
        else:
            raise ValueError("[TypeError] Document in insuitable type.")
        # Maps each XML element class to the wrapper class built for it.
        wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
        for node in container.iterchildren():
            for xml_type, wrapper_cls in wrappers:
                if isinstance(node, xml_type):
                    yield wrapper_cls(node, parent)
                    break
    
    
    def table2list(table):
        """Return the text of every cell of *table* as a list of row lists.

        :param table: object exposing ``rows``; each row exposes ``cells`` and
            each cell exposes ``text``
        :return: ``[[cell.text, ...], ...]`` -- one inner list per row, in row
            order; an empty list when the table has no rows
        """
        return [[cell.text for cell in row.cells] for row in table.rows]
    
    # Keyword searched for inside paragraph runs
    word = '段落关键字'
    # Keyword compared against the first column of each table row
    table_text = '表格关键字'
    
    
    def set_run(run, font_size, bold, color, name):
        """Apply size, weight, colour and typeface to a run object.

        :param run: run object to format
        :param font_size: value assigned to ``run.font.size``
        :param bold: value assigned to ``run.bold``
        :param color: value assigned to ``run.font.color.rgb``
        :param name: typeface name assigned to ``run.font.name``; when ``None``
            the East Asian font attribute is left untouched
        :return: None
        """
        run.font.size = font_size
        run.bold = bold
        run.font.color.rgb = color
        run.font.name = name
        if name is None:
            # NOTE(review): python-docx appears to remove <w:rFonts> when the
            # name is cleared, so rPr.rFonts would be None below -- confirm
            # against the installed library version.
            return
        # The original author notes this extra step is required for the
        # typeface to take effect: the name is also written to w:eastAsia.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), name)
    
    
    def _add_run_after(paragraph, anchor, text, font_size, bold, color, name):
        """Add a formatted run to *paragraph* and move it right behind *anchor*.

        :param paragraph: paragraph that owns *anchor*
        :param anchor: existing run after which the new run must appear
        :param text: text of the new run
        :return: the newly created run
        """
        run = paragraph.add_run(text)
        set_run(run, font_size, bold, color, name)
        # add_run() appends at the end of the paragraph; addnext() relocates
        # the new element so it directly follows the anchor run.
        anchor._element.addnext(run._element)
        return run


    def paragraphs_utils(obj):
        """Highlight every occurrence of ``word`` in the document's paragraphs.

        Each run containing the keyword is emptied and replaced, in place, by
        a sequence of runs: the text fragments keep the original size, bold
        flag and colour, while the keyword itself is coloured red. All new
        runs use the typeface '楷体'. The result is saved to
        '标注后的文档.docx'.

        :param obj: document object exposing ``paragraphs`` and ``save``
        :return: None
        """
        name = u'楷体'
        for p in obj.paragraphs:
            # p.runs is read once here, so runs inserted below are not revisited.
            for r in p.runs:
                if word not in r.text:
                    continue
                font_size = r.font.size
                bold = r.bold
                color = r.font.color.rgb
                # Split the run text around the keyword, then clear the run.
                fragments = r.text.split(word)
                r.text = ''
                # Chain new runs behind the emptied run so that the text order
                # is kept even when this run is not the last in the paragraph.
                anchor = r
                for index, fragment in enumerate(fragments):
                    if index:
                        # A keyword sat between the previous fragment and this one.
                        anchor = _add_run_after(p, anchor, word, font_size, bold, color, name)
                        anchor.font.color.rgb = RGBColor(255, 0, 0)
                    if fragment:
                        # Empty fragments are skipped so no empty runs are added.
                        anchor = _add_run_after(p, anchor, fragment, font_size, bold, color, name)
        obj.save('标注后的文档.docx')
    
    
    def table_utils(obj):
        """Colour red the data cells of every table row labelled ``table_text``.

        For each table, rows after the first whose first-column text equals
        ``table_text`` are processed: in every other column, each run of the
        cell's first paragraph is emptied and its stripped text is re-added as
        a new red run in typeface '楷体', keeping the original size and bold
        flag. The result is saved to '标注后的表格.docx'.

        :param obj: document object exposing ``tables`` and ``save``
        :return: None
        """
        for tbl in obj.tables:
            # Cell texts go into a DataFrame so they can be indexed by (row, col).
            frame = pd.DataFrame(table2list(tbl))
            row_count, col_count = frame.shape
            # Row 0 is skipped, as is column 0, which holds the row label.
            for row_idx in range(1, row_count):
                if frame.iloc[row_idx, 0] != table_text:
                    continue
                for col_idx in range(1, col_count):
                    first_para = tbl.cell(row_idx, col_idx).paragraphs[0]
                    for old_run in first_para.runs:
                        size = old_run.font.size
                        is_bold = old_run.bold
                        old_rgb = old_run.font.color.rgb
                        stripped = old_run.text.strip()
                        # Empty the old run and re-add its text as a new run.
                        old_run.text = ''
                        new_run = first_para.add_run(stripped)
                        set_run(new_run, size, is_bold, old_rgb, u'楷体')
                        new_run.font.color.rgb = RGBColor(255, 0, 0)
        obj.save('标注后的表格.docx')
    
    
    # Walk paragraphs and tables in document order and annotate both in one pass.
    for block in iter_block_items(obj):
        if isinstance(block, Paragraph):
            # block.runs is read once here, so runs inserted below are not revisited.
            for r in block.runs:
                if word not in r.text:
                    continue
                print(r.text)
                print(r.style.name)
                font_size = r.font.size
                bold = r.bold
                color = r.font.color.rgb
                name = u'楷体'
                # Split the run text around the keyword, then clear the run.
                rest = r.text.split(word)
                r.text = ''
                # add_run() appends at the end of the paragraph, so each new run
                # is moved directly behind the previous one (starting at the
                # emptied run) to keep the text order of the paragraph.
                anchor = r._element
                for index, text in enumerate(rest):
                    if index:
                        # A keyword sat between the previous fragment and this one.
                        run = block.add_run(word)
                        set_run(run, font_size, bold, color, name)
                        run.font.color.rgb = RGBColor(255, 0, 0)
                        anchor.addnext(run._element)
                        anchor = run._element
                    if text:
                        # Empty fragments are skipped so no empty runs are added.
                        run = block.add_run(text=text)
                        set_run(run, font_size, bold, color, name)
                        anchor.addnext(run._element)
                        anchor = run._element
        else:
            # Cell texts go into a DataFrame so they can be indexed by (row, col).
            pd_block = pd.DataFrame(table2list(block))
            for rows in range(pd_block.shape[0]):
                # shape is (row count, column count); row 0 is skipped.
                if rows == 0: continue
                # Only rows whose first column equals the table keyword are processed.
                if table_text != pd_block.iloc[rows, 0]: continue
                for cols in range(pd_block.shape[1]):
                    if cols == 0: continue
                    # Only the first paragraph of the cell is rewritten.
                    rs = block.cell(rows, cols).paragraphs[0]
                    for r in rs.runs:
                        font_size = r.font.size
                        bold = r.bold
                        color = r.font.color.rgb
                        name = u'楷体'
                        data = r.text.strip()
                        # Empty the old run and re-add its text as a new red run.
                        r.text = ''
                        run = rs.add_run(data)
                        set_run(run, font_size, bold, color, name)
                        run.font.color.rgb = RGBColor(255, 0, 0)

    obj.save('段落与表格标注后的文档.docx')

    匹配关键字回写docx替换颜色

  • 相关阅读:
    我的收藏:第一章:汇总
    实战:第二章:关于EZDML工具使用踩的坑
    Docker:第二章:部署项目,对镜像,容器的操作
    电商网站项目:第一章:进度
    多线程:第一章:我(线程)这一生
    性能优化:第一章:我们应该怎么优化
    关于EZDML数据库表结构制作设计工具使用踩的坑
    集群与分布的区别
    strip_tags--php
    关于html的一些杂技
  • 原文地址:https://www.cnblogs.com/nixindecat/p/12157623.html
Copyright © 2020-2023  润新知