• 爬虫----配合多线程的思路


    from pyquery import PyQuery as pq
    import os
    from queue import Queue
    from threading import Thread
    class txtparser(Thread):
        """Worker thread that converts saved chapter HTML files to text files.

        Every item taken from ``queue`` is the path of one HTML file.  The
        chapter title and body are extracted with PyQuery and written to
        ``<parent directory name>/<title>.txt`` (GBK encoded) relative to the
        current working directory.  Paths that cannot be processed are
        appended to ``errorecorder.log``.

        Args:
            queue: a ``queue.Queue`` of HTML file paths shared by all workers.
        """

        def __init__(self, queue):
            Thread.__init__(self)
            self.queue = queue

        def run(self):
            """Consume paths forever, marking each one done exactly once."""
            while True:
                content = self.queue.get()
                try:
                    self._convert(content)
                except Exception:
                    # Worker boundary: one bad file must not kill the thread,
                    # otherwise the remaining queue items are never consumed.
                    self._log_failure(content)
                finally:
                    # Without task_done() a caller blocked in queue.join()
                    # would wait forever.
                    self.queue.task_done()

        def _convert(self, content):
            """Parse one HTML file and write its body to a text file."""
            html = self._read_html(content)
            doc = pq(html)
            title = doc("#main .content_read .box_con .bookname h1").text()
            print("标题=====", title)
            clipname = self._parent_dir_name(content)
            passage = doc("#content").text()
            print("正文======", passage.replace("<br/>", ""))
            # NOTE(review): the original applied str.replace(clipname, "", "")
            # twice here, which is a no-op; the characters it was meant to
            # strip are not recoverable from this source - confirm and restore.

            # exist_ok avoids the exists()/mkdir() race between worker threads.
            os.makedirs(clipname, exist_ok=True)
            target = os.path.join(clipname, title + ".txt")
            try:
                with open(target, "w", encoding="gbk") as writer:
                    writer.write(passage)
                print("完成{}的写入".format(target))
            except (OSError, UnicodeError):
                # Bad file name or text that GBK cannot encode: record the
                # output file that could not be produced and move on.
                self._log_failure(target)
            print("文件夹名称======", clipname)

        @staticmethod
        def _read_html(path):
            """Return the file's text, trying UTF-8 first, then the locale default."""
            try:
                with open(path, "r", encoding="utf-8") as reader:
                    return reader.read()
            except UnicodeDecodeError:
                with open(path, "r") as reader:
                    return reader.read()

        @staticmethod
        def _parent_dir_name(path):
            """Return the name of the directory that directly contains ``path``.

            Both ``\\`` and ``/`` are accepted as separators.  Raises
            ``IndexError`` when ``path`` has no directory component.
            """
            return path.replace("\\", "/").split("/")[-2]

        @staticmethod
        def _log_failure(path):
            """Append ``path`` to the error log (best effort)."""
            try:
                # errors="replace" keeps the platform encoding but never fails
                # on characters that encoding cannot represent.
                with open("errorecorder.log", "a", errors="replace") as writer:
                    writer.write(path + "\n")
            except OSError:
                # Logging is best effort; an unwritable log must not stop
                # the worker.
                pass
    
    def launchtxtparser(parentdir):
        """Queue every file one level below ``parentdir`` and convert them.

        Each immediate sub-directory of ``parentdir`` is listed and the full
        path of every entry in it is put on a queue.  Ten daemon ``txtparser``
        threads are then started on that queue, and the call blocks in
        ``queue.join()`` until every queued path has been marked done.

        Args:
            parentdir: directory whose sub-directories hold the HTML files.
        """
        rootdir = parentdir
        queue = Queue()
        print(rootdir)
        for i in os.listdir(rootdir):
            print(i)
            subdir = os.path.join(rootdir, i)
            if not os.path.isdir(subdir):
                continue
            print(subdir)
            # Iterate the listing directly: the previous print(next(g)) consumed
            # the first file without queueing it and raised StopIteration on an
            # empty directory.
            for filename in os.listdir(subdir):
                fullfilename = os.path.join(subdir, filename)
                queue.put(fullfilename)
                print(fullfilename)
            print("ooooophs~处理完毕")
        for _ in range(10):
            cpc = txtparser(queue)
            # Daemon threads so the endless worker loops do not keep the
            # process alive once the queue has been drained.
            cpc.daemon = True
            cpc.start()
        queue.join()
    #print(os.listdir(rootdir))
    # Script entry: start the conversion at module level, so it also runs
    # when this file is imported.
    # NOTE(review): "E:月关" has no separator after the drive letter; on
    # Windows such a path is resolved against the current directory of drive
    # E: rather than its root - confirm whether "E:\\月关" was intended.
    launchtxtparser("E:月关")
  • 相关阅读:
    Nacos和Eureka的区别以及选型
    将word中的标题和正文按照大纲等级导入到excel中
    jeecgboot启动gateway找不到路由的问题(跨域访问的问题)
    单点登录的整理
    使用批处理文件(.bat)同时启动多个CMD服务
    安装nacos启动报错以及版本对应关系问题解决
    关于Spring事务管理 之 默认事务间调用问题
    octopus 为梦而生的八爪鱼
    我太难了
    节日快乐
  • 原文地址:https://www.cnblogs.com/saintdingspage/p/10582296.html
Copyright © 2020-2023  润新知