• 利用requests、pyquery、BeautifulSoup爬取深圳市某租房网站的租房信息


      1 import requests
      2 from requests.exceptions import RequestException
      3 from pyquery import PyQuery as pq
      4 from bs4 import BeautifulSoup
      5 import pymongo
      6 from config import *
      7 from multiprocessing import Pool
      8 import time
      9 
client = pymongo.MongoClient(MONGO_URL)    # MongoDB connection (MONGO_URL comes from config)
db = client[MONGO_DB]    # database handle (MONGO_DB comes from config)
     12 
     13 def get_one_page_html(url):    # 获取网站每一页的html
     14     headers = {
     15         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
     16                       "Chrome/85.0.4183.121 Safari/537.36"
     17     }
     18     try:
     19         response = requests.get(url, headers=headers)
     20         if response.status_code == 200:
     21             return response.text
     22         else:
     23             return None
     24     except RequestException:
     25         return None
     26 
     27 
     28 def get_room_url(html):    # 获取当前页面上所有room_info的url
     29     soup = BeautifulSoup(html, 'lxml')
     30     addresses = soup.find_all('p', {'class': 'content__list--item--des'})
     31     doc = pq(html)
     32     room_urls = doc('.content__list--item--main .content__list--item--title a').items()
     33     return room_urls, addresses
     34 
     35 
     36 def parser_room_page(room_html, address_queue1, address_queue2, address_queue3):    # 对租房详情页面进行解析,获取特定信息
     37     soup = BeautifulSoup(room_html, 'lxml')
     38     pinpai = soup.find('p', {'class': 'content__aside__list--subtitle oneline'}).text.strip().split(' ')[0]
     39     price = soup.find_all('li', {'class': 'table_col'})
     40     zujin = price[6].text    # 租金
     41     yajin = price[7].text    # 押金
     42     fuwufei = price[8].text    # 服务费
     43     zhongjiefei = price[9].text    # 中介费
     44     house_type = soup.find('ul', {'class': 'content__aside__list'}).find_all('li')[1].text[5:11]    # 户型
     45     x = soup.find_all('li', {'class': 'fl oneline'})
     46     area = x[1].text[3:]  # 面积
     47     floor = x[7].text[3:]    # 楼层
     48     direction = x[2].text[3:]    # 朝向
     49     elevator = x[8].text[3:]    # 有无电梯
     50     carport = x[10].text[3:]    # 有无车位
     51     tenancy = x[18].text[3:]    # 租期
     52     maintenance = x[4].text[3:]    # 维护日期
     53     kanfang = x[21].text[3:]   # 看房是否要预约
     54     tags = soup.find('p', {'class': 'content__aside--tags'}).get_text().replace('
    ', '')    # 标签
     55 
     56     yield {
     57         'pinpai': pinpai,
     58         'zujin': zujin,
     59         'yajin': yajin,
     60         'fuwufei': fuwufei,
     61         'zhongjiefei': zhongjiefei,
     62         'house_type': house_type,
     63         'area': area,
     64         'floor': floor,
     65         'direction': direction,
     66         'elevator': elevator,
     67         'carport': carport,
     68         'tenancy': tenancy,
     69         'maintenance': maintenance,
     70         'kanfang': kanfang,
     71         'location1': address_queue1.pop(),
     72         'location2': address_queue2.pop(),
     73         'location3': address_queue3.pop(),
     74         'tags': tags,
     75     }
     76 
     77 
     78 def save_to_mongo(result):
     79     if db[MONGO_TABLE].insert_one(result):
     80         print('存储到mongodb成功', result)
     81         return True
     82     return False
     83 
     84 
     85 def main(page):
     86     url = 'http://sz.xxxxx.com/zufang/pg' + str(page) + 'rt200600000002/#contentList'
     87     html = get_one_page_html(url)
     88     room_urls, addresses = get_room_url(html)
     89     address_queue1 = []    # 采用队列数据结构,先进先出,用来存放租房区域(南山区、福田区等)
     90     address_queue2 = []
     91     address_queue3 = []    # 采用队列数据结构,先进先出,用来存放租房具体小区
     92     for address in addresses:
     93         temp = address.find_all('a')
     94         address_queue1.insert(0, temp[0].text)
     95         address_queue2.insert(0, temp[1].text)
     96         address_queue3.insert(0, temp[2].text)
     97     for room_url in room_urls:
     98         room_url_href = room_url.attr('href')
     99         room_url_href = 'http://sz.xxxxx.com/' + room_url_href
    100         room_html = get_one_page_html(room_url_href)
    101         if room_html is None:    # 非常重要,否则room_html为None时会报错
    102             pass
    103         else:
    104             # parser_room_page(room_html, address_queue1, address_queue2, address_queue3)
    105             results = parser_room_page(room_html, address_queue1, address_queue2, address_queue3)
    106             for result in results:
    107                 save_to_mongo(result)
    108 
    109 if __name__ == '__main__':
    110     time1 = time.time()
    111     pool = Pool()  # 使用多进程提高爬取效率
    112     pool.map(main, [i for i in range(1, 101)])
    113     time2 = time.time()
    114     print(time2 - time1)    # 耗时
  • 相关阅读:
    .net技巧推荐
    ASPNETPager常用属性
    带有like的存储过程
    Jquery选择器
    关于出现too many open files异常
    将ReadWriteLock应用于缓存设计
    读CopyOnWriteArrayList有感
    HttpClient容易忽视的细节——连接关闭
    windows下如何用java命令运行jar包?
    再谈重入锁ReentrantLock
  • 原文地址:https://www.cnblogs.com/chang2021/p/14073855.html
Copyright © 2020-2023  润新知