• 爬虫入门之线程进程协程抓取方法(八)


    1 多线程抓取

    
    import lxml
    from lxml import etree
    import requests
    import threading
    import time
    
    rlock = threading.RLock()  # re-entrant lock: serializes file writes across worker threads
    # desktop Chrome User-Agent sent with every request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    
    def getArea(url):
        '''
        Scrape the district navigation bar and map each district to its URL.
        :param url: seed URL (second-hand housing index page)
        :return: dict of {district name: absolute listing URL}
        '''
        page = requests.get(url, headers=headers).text
        tree = etree.HTML(page)

        anchors = tree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
        areaDict = {}
        for anchor in anchors:
            name = anchor.xpath('./text()')[0]
            link = "https://hz.lianjia.com" + anchor.xpath('./@href')[0]
            # e.g. 西湖 https://hz.lianjia.com/ershoufang/xihu/
            print(name, link)
            areaDict[name] = link
        return areaDict
    
    def gethouseInfo(areaName, url):
        '''
        Fetch one district listing page and append every house to <areaName>.txt.
        :param areaName: district name, used as the output file name
        :param url: listing-page URL of the district
        :return: None
        '''
        response = requests.get(url, headers=headers).text
        mytree = etree.HTML(response)

        # fixed the original quote clash ("..."clear"...") which was a SyntaxError
        sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
        for house in sellList:
            # headline of the listing
            title = house.xpath('.//div[@class="title"]/a/text()')[0]
            # detail-page url
            houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # anchor text + remaining text of the houseInfo div
            houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0] +
                         house.xpath('.//div[@class="houseInfo"]/text()')[0])
            # leading text + anchor text of the positionInfo div
            positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0] +
                            house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
            # total price with unit appended ('万' = 10,000 CNY)
            totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # price per square metre
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

            # serialize writes so concurrent threads do not interleave lines
            with rlock:
                print(areaName)
                with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                    f.write(str((title, houseInfo, houseurl, positionInfo,
                                 totalPrice, unitPrice)) + '\n')
    
    if __name__ == '__main__':
        starUrl = "https://hz.lianjia.com/ershoufang/"
        areaDict = getArea(starUrl)
        # time.clock() was removed in Python 3.8; use a wall-clock timer instead
        start = time.perf_counter()
        print(areaDict)
        # one thread per district
        threadList = []
        for areaName, url in areaDict.items():
            t = threading.Thread(target=gethouseInfo, args=(areaName, url))
            threadList.append(t)
            t.start()

        # wait for every thread to finish before reporting the elapsed time
        for i in threadList:
            i.join()
        print(time.perf_counter() - start)
    
    

    2 多协程抓取

    import gevent
    from gevent import monkey
    gevent.monkey.patch_all()   # patch blocking stdlib IO; must run before the modules below are imported
    import lxml
    from lxml import etree
    import requests
    import threading
    import time
    
    rlock = threading.RLock()  # re-entrant lock (unused here: greenlets are cooperative)
    # desktop Chrome User-Agent sent with every request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    
    def getArea(url):
        '''
        Collect district names and their listing URLs from the seed page.
        :param url: seed URL (second-hand housing index page)
        :return: dict of {district name: absolute listing URL}
        '''
        html = requests.get(url, headers=headers).text
        tree = etree.HTML(html)
        links = tree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
        areaDict = {}
        for link in links:
            name = link.xpath('./text()')[0]
            full_url = "https://hz.lianjia.com" + link.xpath('./@href')[0]
            print(name, full_url)
            areaDict[name] = full_url
        return areaDict
    
    def gethouseInfo(areaName, url):
        '''
        Fetch one district listing page and append every house to ./hz/<areaName>.txt.
        :param areaName: district name, used as the output file name
        :param url: listing-page URL of the district
        :return: None
        '''
        import os
        response = requests.get(url, headers=headers).text
        mytree = etree.HTML(response)
        # fixed the original quote clash ("..."clear"...") which was a SyntaxError
        sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
        # make sure the output directory exists before appending
        os.makedirs("./hz", exist_ok=True)
        for house in sellList:
            # headline of the listing
            title = house.xpath('.//div[@class="title"]/a/text()')[0]
            # detail-page url
            houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # anchor text + remaining text of the houseInfo div
            houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0] +
                         house.xpath('.//div[@class="houseInfo"]/text()')[0])
            # leading text + anchor text of the positionInfo div
            positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0] +
                            house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
            # total price with unit appended ('万' = 10,000 CNY)
            totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # price per square metre
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo,
                             totalPrice, unitPrice)) + '\n')
    
    if __name__ == '__main__':
        starUrl = "https://hz.lianjia.com/ershoufang/"
        areaDict = getArea(starUrl)
        # time.clock() was removed in Python 3.8; use a wall-clock timer instead
        start = time.perf_counter()
        print(areaDict)
        # one greenlet per district; monkey.patch_all() at the top makes the
        # blocking requests calls cooperative
        geventList = []
        for k, v in areaDict.items():
            g = gevent.spawn(gethouseInfo, k, v)
            geventList.append(g)
        gevent.joinall(geventList)
        print(time.perf_counter() - start)
    
    

    3 多进程抓取

    import lxml
    from lxml import etree
    import requests
    
    import multiprocessing
    import time
    
    # desktop Chrome User-Agent sent with every request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    
    def getArea(url):
        '''
        Build a {district name: listing URL} mapping from the index page.
        :param url: seed URL (second-hand housing index page)
        :return: dict of {district name: absolute listing URL}
        '''
        body = requests.get(url, headers=headers).text
        doc = etree.HTML(body)
        nav_links = doc.xpath('//div[@data-role="ershoufang"]/div[1]/a')
        areaDict = {}
        for nav in nav_links:
            district = nav.xpath('./text()')[0]
            absolute = "https://hz.lianjia.com" + nav.xpath('./@href')[0]
            print(district, absolute)
            areaDict[district] = absolute
        return areaDict
    
    def gethouseInfo(areaName, url):
        '''
        Fetch one district listing page and append every house to ./hz/<areaName>.txt.
        :param areaName: district name, used as the output file name
        :param url: listing-page URL of the district
        :return: None
        '''
        import os
        response = requests.get(url, headers=headers).text
        mytree = etree.HTML(response)
        # fixed the original quote clash ("..."clear"...") which was a SyntaxError
        sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
        # make sure the output directory exists before appending
        os.makedirs("./hz", exist_ok=True)
        for house in sellList:
            # headline of the listing
            title = house.xpath('.//div[@class="title"]/a/text()')[0]
            # detail-page url
            houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # anchor text + remaining text of the houseInfo div
            houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0] +
                         house.xpath('.//div[@class="houseInfo"]/text()')[0])
            # leading text + anchor text of the positionInfo div
            positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0] +
                            house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
            # total price with unit appended ('万' = 10,000 CNY)
            totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # price per square metre
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo,
                             totalPrice, unitPrice)) + '\n')
    
    if __name__ == '__main__':
        starUrl = "https://hz.lianjia.com/ershoufang/"
        areaDict = getArea(starUrl)
        # time.clock() was removed in Python 3.8; use a wall-clock timer instead
        start = time.perf_counter()
        print(areaDict)
        # one worker process per district
        processList = []
        for areaName, url in areaDict.items():
            t = multiprocessing.Process(target=gethouseInfo, args=(areaName, url))
            processList.append(t)
            t.start()

        # wait for every process (not thread) to finish before timing
        for i in processList:
            i.join()
        print(time.perf_counter() - start)
    
    

    4 多线程加协程

    import gevent
    from gevent import monkey
    gevent.monkey.patch_all()  # patch blocking stdlib IO; must run before the modules below are imported
    import json
    
    import lxml
    from lxml import etree
    import requests
    import threading
    import time
    
    rlock = threading.RLock()  # re-entrant lock: serializes file writes across worker threads
    # desktop Chrome User-Agent sent with every request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    
      # 非阻塞IO
    def getArea(url):
        '''
        Extract every district name and its listing URL from the seed page.
        :param url: seed URL (second-hand housing index page)
        :return: dict of {district name: absolute listing URL}
        '''
        markup = requests.get(url, headers=headers).text
        root = etree.HTML(markup)
        items = root.xpath('//div[@data-role="ershoufang"]/div[1]/a')
        areaDict = {}
        for item in items:
            label = item.xpath('./text()')[0]
            target = "https://hz.lianjia.com" + item.xpath('./@href')[0]
            print(label, target)
            areaDict[label] = target
        return areaDict
    
    def gethouseInfo(areaName, url):
        '''
        Fetch one district listing page and append every house to ./hz/<areaName>.txt.
        :param areaName: district name, used as the output file name
        :param url: listing-page URL of the district (one result page)
        :return: None
        '''
        import os
        response = requests.get(url, headers=headers).text
        mytree = etree.HTML(response)
        # fixed the original quote clash ("..."clear"...") which was a SyntaxError
        sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
        # make sure the output directory exists before appending
        os.makedirs("./hz", exist_ok=True)
        for house in sellList:
            # headline of the listing
            title = house.xpath('.//div[@class="title"]/a/text()')[0]
            # detail-page url
            houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # anchor text + remaining text of the houseInfo div
            houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0] +
                         house.xpath('.//div[@class="houseInfo"]/text()')[0])
            # leading text + anchor text of the positionInfo div
            positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0] +
                            house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
            # total price with unit appended ('万' = 10,000 CNY)
            totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # price per square metre
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]

            # serialize writes so concurrent threads do not interleave lines
            with rlock:
                print(areaName)
                with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                    f.write(str((title, houseInfo, houseurl, positionInfo,
                                 totalPrice, unitPrice)) + '\n')
    
    def getPageNum(areaName, url):
        '''
        Read the district's total page count and crawl every result page
        with one greenlet per page.
        :param areaName: district name, forwarded to gethouseInfo
        :param url: district listing URL (page 1)
        '''
        html = requests.get(url, headers=headers).text
        tree = etree.HTML(html)
        # the page-data attribute holds JSON with a 'totalPage' field
        raw = tree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
        total = json.loads(raw)['totalPage']

        jobs = []
        for page in range(1, int(total) + 1):
            jobs.append(gevent.spawn(gethouseInfo, areaName, url + "pg%d/" % page))
        gevent.joinall(jobs)
    
    if __name__ == '__main__':
        starUrl = "https://hz.lianjia.com/ershoufang/"
        areaDict = getArea(starUrl)
        # time.clock() was removed in Python 3.8; use a wall-clock timer instead
        start = time.perf_counter()
        print(areaDict)
        # one thread per district; each thread fans out into greenlets per page
        threadList = []
        for areaName, url in areaDict.items():
            t = threading.Thread(target=getPageNum, args=(areaName, url))
            threadList.append(t)
            t.start()

        # wait for every thread to finish before reporting the elapsed time
        for i in threadList:
            i.join()

        print(time.perf_counter() - start)
    
    

    5 多进程加协程

    
    import gevent
    from gevent import monkey
    gevent.monkey.patch_all()  # patch blocking stdlib IO; must run before the modules below are imported
    import json
    
    import lxml
    from lxml import etree
    import requests
    import multiprocessing
    import time
    
    # desktop Chrome User-Agent sent with every request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    
      # 非阻塞IO
    def getArea(url):
        '''
        Map each district shown in the navigation bar to its listing URL.
        :param url: seed URL (second-hand housing index page)
        :return: dict of {district name: absolute listing URL}
        '''
        content = requests.get(url, headers=headers).text
        dom = etree.HTML(content)

        entries = dom.xpath('//div[@data-role="ershoufang"]/div[1]/a')
        areaDict = {}
        for entry in entries:
            title = entry.xpath('./text()')[0]
            href = "https://hz.lianjia.com" + entry.xpath('./@href')[0]
            print(title, href)
            areaDict[title] = href
        return areaDict
    
    def gethouseInfo(areaName, url):
        '''
        Fetch one district listing page and append every house to ./hz/<areaName>.txt.
        :param areaName: district name, used as the output file name
        :param url: listing-page URL of the district (one result page)
        :return: None
        '''
        import os
        response = requests.get(url, headers=headers).text
        mytree = etree.HTML(response)

        # fixed the original quote clash ("..."clear"...") which was a SyntaxError
        sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
        # make sure the output directory exists before appending
        os.makedirs("./hz", exist_ok=True)
        for house in sellList:
            # headline of the listing
            title = house.xpath('.//div[@class="title"]/a/text()')[0]
            # detail-page url
            houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
            # anchor text + remaining text of the houseInfo div
            houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0] +
                         house.xpath('.//div[@class="houseInfo"]/text()')[0])
            # leading text + anchor text of the positionInfo div
            positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0] +
                            house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
            # total price with unit appended ('万' = 10,000 CNY)
            totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
            # price per square metre
            unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
            print(areaName)
            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(str((title, houseInfo, houseurl, positionInfo,
                             totalPrice, unitPrice)) + '\n')
    
    
    def getPageNum(areaName, url):
        '''
        Read the district's total page count and crawl every result page
        with one greenlet per page.
        :param areaName: district name, forwarded to gethouseInfo
        :param url: district listing URL (page 1)
        '''
        html = requests.get(url, headers=headers).text
        tree = etree.HTML(html)
        # the page-data attribute holds JSON with a 'totalPage' field
        raw = tree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
        total = json.loads(raw)['totalPage']

        jobs = []
        for page in range(1, int(total) + 1):
            jobs.append(gevent.spawn(gethouseInfo, areaName, url + "pg%d/" % page))
        gevent.joinall(jobs)
    
    if __name__ == '__main__':
        starUrl = "https://hz.lianjia.com/ershoufang/"
        areaDict = getArea(starUrl)
        # time.clock() was removed in Python 3.8; use a wall-clock timer instead
        start = time.perf_counter()
        print(areaDict)
        # one worker process per district; each process fans out into greenlets per page
        processList = []
        for areaName, url in areaDict.items():
            p = multiprocessing.Process(target=getPageNum, args=(areaName, url))
            processList.append(p)
            p.start()

        # wait for every process to finish before reporting the elapsed time
        for i in processList:
            i.join()
        print(time.perf_counter() - start)
    
    
  • 相关阅读:
    sed与正则用法收集
    第一学期网络技术知识总汇
    常用的windows注册表大全
    1-1 Linux系统安装
    用javascript写计算器
    closure
    Linux-Rhel6 恢复误删文件
    flask第十一篇——自定义url转换器
    flask第十篇——url_for【3】
    flask第九篇——url_for【2】
  • 原文地址:https://www.cnblogs.com/why957/p/9252496.html
Copyright © 2020-2023  润新知