• 用python实现一个文档小工具(支持文档关键字筛选)


    功能:输入关键词后,批量扫描源文件夹中的 doc、docx、pdf 文件,并把内容包含该关键词的文件复制到输出文件夹

    下面开始上代码。本人不是专业的 Python 程序员,代码写得不好请勿喷,哈哈

    from PyQt5.QtWidgets import *
    from PyQt5.QtGui import *
    from PyQt5.QtCore import *
    import sys, os
    import docx
    from docx import Document
    import os
    import shutil
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice
    
    class Window(QDialog):
        def __init__(self, parent=None):
            """Create the dialog, build its widgets, then set its title and size.

            :param parent: optional parent widget forwarded to the base class.
            """
            super().__init__(parent)
            # Initial text for both path fields; initUI() reads it, so set it first.
            self.path = ''
            self.initUI()
            self.resize(240, 200)
            self.setWindowTitle("文件小助手")
    
        def initUI(self):
            """Create the widgets, place them in a grid layout and connect the signals.

            Rows, top to bottom: source folder, output folder, keyword, start button.
            """
            grid = QGridLayout()

            # Row 0: source folder field and its folder picker.
            grid.addWidget(QLabel("源路径:"), 0, 0)
            self.pathLineEdit = QLineEdit()
            self.pathLineEdit.setFixedWidth(200)
            self.pathLineEdit.setText(self.path)
            grid.addWidget(self.pathLineEdit, 0, 1)
            sourceButton = QPushButton("选择文件夹")
            grid.addWidget(sourceButton, 0, 3)
            sourceButton.clicked.connect(self.msg)

            # Row 1: output folder field and its folder picker.
            grid.addWidget(QLabel("输出路径:"), 1, 0)
            self.pathLineEdit1 = QLineEdit()
            self.pathLineEdit1.setFixedWidth(200)
            self.pathLineEdit1.setText(self.path)
            grid.addWidget(self.pathLineEdit1, 1, 1)
            targetButton = QPushButton("选择文件夹")
            grid.addWidget(targetButton, 1, 3)
            targetButton.clicked.connect(self.msg1)

            # Row 2: keyword field. The grid layout owns the widget geometry, so
            # no manual move()/resize() is applied.
            grid.addWidget(QLabel("关键字:"), 2, 0)
            self.textbox = QLineEdit(self)
            grid.addWidget(self.textbox, 2, 1)

            # Row 3: start button.
            self.button1 = QPushButton('点我开始干活儿', self)
            grid.addWidget(self.button1, 3, 1)
            self.setLayout(grid)

            # The widgets themselves are handed over so that working() reads
            # their text at click time rather than at construction time.
            self.button1.clicked.connect(
                lambda: self.working(self.pathLineEdit, self.pathLineEdit1, self.textbox))
    
        def msg(self):
            """Ask the user for the source folder and show it in the source path field.

            A cancelled dialog yields an empty string; in that case the field
            keeps its current content.
            """
            # "./" makes the dialog start in the current working directory.
            chosen = QFileDialog.getExistingDirectory(self, "选取文件夹", "./")
            if not chosen:
                return
            self.pathLineEdit.setText(chosen)
            print(chosen)
    
        def msg1(self):
            """Ask the user for the output folder and show it in the output path field.

            A cancelled dialog yields an empty string; in that case the field
            keeps its current content.
            """
            # "./" makes the dialog start in the current working directory.
            chosen = QFileDialog.getExistingDirectory(self, "选取文件夹", "./")
            if not chosen:
                return
            self.pathLineEdit1.setText(chosen)
            print(chosen)
    
        #word 解析器
        def readDoc(self,root,path,target,key):
            """Copy a Word file into ``target`` when ``key`` occurs in its text.

            Paragraph text is searched first, then the text of every table cell.
            The file is copied at most once.

            :param root: directory that contains the file (not used here; kept
                so existing callers keep working).
            :param path: full path of the Word file to inspect.
            :param target: output directory handed to ``copyFile``.
            :param key: keyword to look for.
            :return: None.
            """
            # Open the file under its own name. Renaming *.doc to *.docx would
            # alter the user's source folder without converting the file.
            # NOTE(review): python-docx targets the .docx format, so legacy
            # binary .doc files are expected to fail here and be skipped.
            try:
                document = Document(path)
            except Exception as err:
                # Best effort: one unreadable file must not stop the batch.
                print('skip unreadable word file:', path, err)
                return

            found = any(key in paragraph.text for paragraph in document.paragraphs)
            if not found:
                found = any(
                    key in cell.text
                    for table in document.tables
                    for row in table.rows
                    for cell in row.cells
                )
            if found:
                self.copyFile(target, path)
    
        #pdf文件解析器
        def readPdf(self,root,path,target,key):
            """Copy a PDF file into ``target`` when ``key`` occurs in its text.

            Pages are processed in order and scanning stops at the first page
            with a layout object whose text contains ``key``. The file is copied
            at most once. A PDF that cannot be opened or parsed is skipped.

            :param root: directory that contains the file (not used here; kept
                so existing callers keep working).
            :param path: full path of the PDF file to inspect.
            :param target: output directory handed to ``copyFile``.
            :param key: keyword to look for.
            :return: None.
            """
            found = False
            try:
                # The context manager closes the file even if parsing fails.
                with open(path, "rb") as fp:
                    # Link the parser and the document object to each other.
                    parser = PDFParser(fp)
                    doc = PDFDocument()
                    parser.set_document(doc)
                    doc.set_parser(parser)

                    # An empty string is passed as the password.
                    doc.initialize("")

                    # Resource manager, layout parameters, aggregating device
                    # and the page interpreter that feeds the device.
                    resource = PDFResourceManager()
                    laparam = LAParams()
                    device = PDFPageAggregator(resource, laparams=laparam)
                    interpreter = PDFPageInterpreter(resource, device)

                    for page in doc.get_pages():
                        interpreter.process_page(page)
                        layout = device.get_result()
                        # Only layout objects exposing get_text() carry text.
                        if any(hasattr(out, "get_text") and key in out.get_text()
                               for out in layout):
                            found = True
                            break
            except Exception as err:
                # Best effort: one unreadable file must not stop the batch.
                print('skip unreadable pdf file:', path, err)
                return

            if found:
                self.copyFile(target, path)
    
        # 复制文件
        def copyFile(self, path, oldname):
            """Copy the file ``oldname`` into the directory ``path``, keeping its base name.

            The directory, including any missing parent directories, is created
            when it does not exist. Copying a file onto itself is skipped.

            :param path: destination directory.
            :param oldname: path of the file to copy.
            :return: None.
            """
            os.makedirs(path, exist_ok=True)
            destination = os.path.join(path, os.path.basename(oldname))
            # shutil.copyfile raises SameFileError when source and destination
            # are the same file, so that case is a no-op here.
            if os.path.exists(destination) and os.path.samefile(oldname, destination):
                return
            shutil.copyfile(oldname, destination)
    
        # 开始干活儿
        @pyqtSlot()
        def working(self,pathLineEdit1,pathLineEdit2,textbox):
            """Scan the source folder and copy every matching document to the output folder.

            Reads the source folder, output folder and keyword from the given
            widgets, validates them, then walks the source folder recursively.
            Files ending in .doc/.docx go to ``readDoc`` and files ending in
            .pdf go to ``readPdf``; the extension check ignores case. A message
            box reports the outcome.

            :param pathLineEdit1: line edit holding the source folder.
            :param pathLineEdit2: line edit holding the output folder.
            :param textbox: line edit holding the keyword.
            :return: None.
            """
            sourcedir = pathLineEdit1.text()
            targetdir = pathLineEdit2.text()
            key = textbox.text()

            # Validate in the order the fields appear; stop at the first empty one.
            required = (
                (sourcedir, '源路径不能为空'),
                (targetdir, '输出路径不能为空'),
                (key, '关键字不能为空'),
            )
            for value, warning in required:
                if value.strip() == '':
                    QMessageBox.warning(self, "Message", warning,
                                        QMessageBox.Ok, QMessageBox.Ok)
                    return

            # Copying a folder into itself would copy each match onto itself.
            source_abs = os.path.normcase(os.path.abspath(sourcedir))
            target_abs = os.path.normcase(os.path.abspath(targetdir))
            if source_abs == target_abs:
                QMessageBox.warning(self, "Message", '输出路径不能与源路径相同',
                                    QMessageBox.Ok, QMessageBox.Ok)
                return

            handled = False
            for root, dirs, files in os.walk(sourcedir):
                # Do not descend into the output folder when it lies inside the
                # source tree, otherwise freshly copied files get scanned again.
                dirs[:] = [
                    d for d in dirs
                    if os.path.normcase(os.path.abspath(os.path.join(root, d))) != target_abs
                ]
                for file in files:
                    fullpath = os.path.join(root, file)
                    suffix = os.path.splitext(file)[1].lower()
                    if suffix in ('.doc', '.docx'):
                        self.readDoc(root, fullpath, targetdir, key)
                        handled = True
                    elif suffix == '.pdf':
                        self.readPdf(root, fullpath, targetdir, key)
                        handled = True

            msg = '处理好了' if handled else '源路径中没有word和pdf文件'
            QMessageBox.information(self, "Message", msg,
                                    QMessageBox.Ok, QMessageBox.Ok)
    
    if __name__ == '__main__':
        # The QApplication object must exist before any widget is created.
        app = QApplication(sys.argv)
        window = Window()
        # Show the dialog modally; its result code is not used.
        window.exec_()

    工具演示效果图如下

    工具下载链接:  https://pan.baidu.com/s/1w7CQUAowSgR_d6V2h5OlwA  密码:kyuy


    文末小福利免费视频资源网站:www.sousuohou.com
  • 相关阅读:
    概率dp——cf148D
    概率dp——处理分母为0的情况hdu3853
    概率dp的迭代方式小结——zoj3329,hdu4089,hdu4035
    概率dp——hdu4089推公式+循环迭代
    概率dp——期望水题hdu4405
    概率dp——逆推期望+循环迭代zoj3329
    单调栈——cf777E
    springMVC 返回类型选择 以及 SpringMVC中model,modelMap.request,session取值顺序
    spring MVC、mybatis配置读写分离
    Spring 实现数据库读写分离
  • 原文地址:https://www.cnblogs.com/vicF/p/9803566.html
Copyright © 2020-2023  润新知