• python3 多线程 采集 xpath


    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    '''Thread3 多线程测试采集'''
    import threading,time,queue,Mongo_utils,mysqlUtils,requests,json,os
    from lxml import html
    # Shortcut to lxml's etree API; spider() uses etree.HTML() to parse pages.
    etree = html.etree
    # Shutdown flag: workers keep looping while this is 0; the main thread sets
    # it to 1 once the work queue has drained.
    exitFlag = 0
    # Database handle returned by the project helper.
    # NOTE(review): presumably a MongoDB database object - confirm in Mongo_utils.
    db = Mongo_utils.mongodb_15_27017task()
    # Destination collection for the scraped agent records.
    table = db["xx_anjuke_agent1"]
    # Source collection holding the URL documents to crawl.
    table_urls = db["xx_spider_urls1"]
    # NOTE(review): not referenced anywhere else in the visible code - confirm
    # whether the call is needed for a side effect before removing it.
    list_pro = mysqlUtils.select_pro()
    # At most 2000 URL documents per run; these are pushed onto the work queue.
    list_urls = table_urls.find().limit(2000)
    # Records appended by the worker threads and bulk-inserted by the main thread.
    insert_list = []
    # Only ever cleared in the visible code; the append in spider() is commented out.
    del_list = []
    class myThread(threading.Thread):
        """Worker thread that runs spider() against a shared work queue."""

        def __init__(self, threadId, name, q):
            """Store the worker's numeric id, its label and the queue it consumes."""
            super().__init__()
            self.q = q
            # Assigned through the Thread.name property (not the constructor) so
            # that any value, including 0, is kept and converted to a string.
            self.name = name
            self.threadId = threadId

        def run(self):
            """Thread entry point: announce start, process the queue, announce exit."""
            label = self.name
            print("开始线程" + label)
            spider(label, self.q)
            print("退出线程" + self.name)
    def head():
        """Return a fresh dict of browser-like HTTP headers for page requests."""
        header_pairs = (
            ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
            ("Accept-Encoding", "gzip, deflate, br"),
            ("Accept-Language", "zh-CN,zh;q=0.9"),
            ("cache-control", "max-age=0"),
            ("upgrade-insecure-requests", "1"),
            ("Connection", "keep-alive"),
            ("Content-Type", "text/html; charset=UTF-8"),
            ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"),
        )
        # A new dict is built on every call so callers may mutate their copy.
        return dict(header_pairs)
    def _scrape_agent(item):
        """Fetch and parse one agent and return its record dict.

        Args:
            item: a URL document providing the keys "_id", "city", "zone",
                "street" and "urls".

        Returns:
            The record dict to store, or None when either response lacks its
            expected marker text and the item should be skipped.

        Raises:
            Any request, JSON, key or XPath-index error; the caller logs it.

        Note:
            An HTTP 302 on either request is treated as a verification
            challenge and terminates the whole process via os._exit(0).
        """
        _id = item["_id"]
        urls = item["urls"]
        headers = head()

        # NOTE(review): the URL template is redacted in this source and has no
        # "%s" placeholder, so this formatting raises TypeError until the real
        # template is restored.
        url = "https://。。。。。。。。。。。" % _id
        # requests.get() closes its internal session; the previous
        # requests.session().get() leaked one session per call.
        response_contact = requests.get(url=url, allow_redirects=False, headers=headers,
                                        timeout=1)
        print(response_contact.status_code)
        if response_contact.status_code == 302:
            print("验证")
            print(url)
            os._exit(0)
        contact = json.loads(response_contact.text)['data']

        response_dl = requests.get(url=urls, allow_redirects=False, headers=headers,
                                   timeout=1)
        if response_dl.status_code == 302:
            print("验证")
            print(urls)
            os._exit(0)

        # Skip pages that do not carry the expected markers (this branch used
        # to fall through into parsing because it ended in a bare `pass`).
        if "获取成功" not in response_contact.text or "房屋编码" not in response_dl.text:
            print("pass")
            return None

        tree = etree.HTML(response_dl.content)
        agent_name = tree.xpath("//div[@class='brokercard-name']/text()")[0].strip()
        company = tree.xpath("//div[@class='broker-company']/p[1]/a/text()")[0]
        company_url = tree.xpath("//div[@class='broker-company']/p[1]/a/@href")[0]
        store = tree.xpath("//div[@class='broker-company']/p[2]/span/text()")[0]
        staffNo = "https://anjuxingye1.anjuke.com/gongsi-jjr-%s/" % _id

        return {
            "_id": _id,
            "city": item["city"],
            "zone": item["zone"],
            "street": item["street"],
            "name": agent_name,
            "company": company,
            "company_url": company_url,
            "store": store,
            "site": "anjuke",
            "store_url": "",
            "staffNo": staffNo,
            "tag": "8",
            "all_comm": "",
            "contact": contact,
        }

    def spider(name,q):
        """Worker loop: take URL documents from `q` and collect agent records.

        Runs until the module-level `exitFlag` becomes truthy. Every record
        that is scraped successfully is appended to the module-level
        `insert_list`; failures are logged and the worker moves on.

        Args:
            name: label of the calling thread, used only in log output.
            q: queue.Queue of URL documents (see _scrape_agent for the keys).
        """
        while not exitFlag:
            try:
                # Queue is thread-safe on its own; a short blocking get avoids
                # spinning on a lock while still noticing exitFlag promptly.
                item = q.get(timeout=0.5)
            except queue.Empty:
                continue

            try:
                record = _scrape_agent(item)
            except Exception as exc:
                # Best effort: one bad item must not kill the worker, but the
                # reason is reported instead of being silently dropped.
                print("%s failed: %r" % (name, exc))
            else:
                if record is not None:
                    insert_list.append(record)
                    print("size: %s" % len(insert_list))
            print("%s processing %s" % (name, item))
    
    def _flush_pending():
        """Bulk-insert the records collected so far and drop them from insert_list.

        Only the snapshotted records are removed, so anything a worker appends
        while the insert is running is kept for the next flush. As before, a
        failed insert is reported and the batch is discarded.
        """
        batch = insert_list[:]
        if not batch:
            # insert_many rejects an empty document list.
            return
        try:
            table.insert_many(batch, ordered=False)
            print("插入1000")
        except Exception as e:
            print(e)
        del insert_list[:len(batch)]
        del_list.clear()

    threadList = range(0,5)
    queueLock = threading.Lock()
    workQueue = queue.Queue(50000)
    threads = []
    threadID = 1
    for tName in threadList:
        thread = myThread(threadID, tName, workQueue)
        thread.start()
        threads.append(thread)
        threadID += 1

    # Fill the queue while holding the lock.
    with queueLock:
        for word in list_urls:
            workQueue.put(word)

    # Wait for the queue to drain, flushing results in small batches.
    while not workQueue.empty():
        if len(insert_list) > 10:
            _flush_pending()
        # Yield the CPU instead of hot-spinning while workers are busy.
        time.sleep(0.1)

    # Tell the workers to stop, then wait for in-flight items to finish BEFORE
    # the final flush so their records are not lost.
    exitFlag = 1
    for t in threads:
        t.join()
    _flush_pending()
    print ("退出主线程")
  • 相关阅读:
    wso2 CEP集成storm实验
    mybatis的decimal精度缺失
    计算时间偏移量小工具
    Blob写入文件
    java父子进程通信
    log4j2配置MDC分线程写日志
    结构体
    局部变量与全局变量
    ARM漏洞
    ARM承认芯片漏洞:披露修复细节
  • 原文地址:https://www.cnblogs.com/tnsay/p/11766827.html
Copyright © 2020-2023  润新知