• Scraper: Fangdd 房多多 (random delays to dodge anti-scraping)


    I originally wanted to take the captcha head-on, but the POST never went through, so I settled for random delays to avoid triggering the anti-scraping checks.
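
    The gist of the workaround is just a random User-Agent plus a random pause between requests. Here is a minimal, standalone sketch of that idea (the function name polite_get and the UA strings are illustrative; the full script below does the same thing inside the Fangdd class):

        import random
        import time

        import requests

        # A small pool of browser User-Agent strings (illustrative values)
        USER_AGENTS = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7; rv:88.0) Gecko/20100101 Firefox/88.0",
        ]

        def polite_get(url):
            # Sleep 1-3 seconds and rotate the User-Agent so the traffic looks less like a bot
            time.sleep(random.randint(1, 3))
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            return requests.get(url, headers=headers)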

    fangdd.py

        import requests
        from lxml import etree
        import re
        from check_code import *    # captcha handling module
        from get_pinyin import *    # Chinese characters to pinyin module
        from save_to_mongo import *
        import time
        import random


        class Fangdd():
            def __init__(self):
                user_agent_list = [
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
                    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"
                ]
                # Build headers with a random User-Agent
                self.headers = {
                    # Pick one entry at random from the list above
                    "User-Agent": random.choice(user_agent_list)
                }
                # self.c = Code_text()
                # self.rand_time = random.randint(1, 5)

            def get_html(self, url):
                # Request the page source
                response = requests.get(url, headers=self.headers)
                html = response.text
                element = etree.HTML(html)
                # Use the title to tell whether the anti-scraping captcha page was served
                title = element.xpath('//title/text()')
                title = title[0]
                if title == 'chech captcha':
                    print("There's a mole, abort the deal (the scraper has been detected, handling the captcha)")
                    # self.c.post_code()
                    # Note: returning None here makes the caller crash; the captcha handler from check_code is meant to be plugged in at this point
                    time.sleep(5)
                else:
                    return html

            def get_location(self, html):
                # Get the city names for the whole country
                addresses = []
                element = etree.HTML(html)
                lis = element.xpath('//div[@class="q3rm0"]/li[position()>1]//a/text()')
                for li in lis:
                    # The page gives Chinese characters but the URLs use pinyin, so convert them
                    li_pinyin = get_pinyin(li)
                    addresses.append(li_pinyin)
                return addresses

            def get_all_url(self, addresses):
                # Build the estate-listing URL for each city
                urls = []
                for address in addresses:
                    addr_url = 'https://%s.fangdd.com/xiaoqu' % address
                    urls.append(addr_url)
                return urls

            def parse_list_page(self, urls):
                not_found = []  # Not every pinyin in the real URLs comes from a straight character conversion, so they cannot all be scraped uniformly
                for p_url in urls:
                    # Sleep for a random interval to avoid the anti-scraping check
                    # time.sleep(self.rand_time)
                    html = self.get_html(p_url)
                    element = etree.HTML(html)
                    # Heteronyms can convert wrongly, so check the page title to find out
                    title = element.xpath('//title/text()')
                    title = title[0]
                    if title == '很抱歉!您访问的页面不存在!':
                        # Some long city names are abbreviated in the URL, so the pinyin-built URL does not match the real one and returns a 404
                        print('Pinyin conversion does not match the real URL, dropping it from the list: %s' % p_url)
                    else:
                        print(title)
                        # With at most 20 estates there is only one page, so the maximum page number is 1
                        max_xiaoqu = element.xpath('//p[@class="filter-result"]/b/text()')
                        max_xiaoqu = max_xiaoqu[0]
                        max_xiaoqu = int(max_xiaoqu)

                        if max_xiaoqu <= 20 and max_xiaoqu != 0:
                            print('This city has only one page')
                            print('max_xiaoqu=%s' % max_xiaoqu)
                            max_page = 1
                            self.get_informations(p_url, max_page)

                        elif max_xiaoqu > 20:
                            # Find the maximum page number to bound the loop
                            max_page = element.xpath('//div[@class="pagebox"]/a[position()<last()]/text()')
                            max_page = max_page[-1].strip()
                            max_page = int(max_page)
                            self.get_informations(p_url, max_page)

                        # The max_xiaoqu == 0 case
                        else:
                            print(p_url + ' this city has no estates')

            def get_informations(self, g_url, max_page):
                print('='*20 + 'Crawling [%s]' % g_url + '='*30)

                for pageNo in range(1, max_page+1):
                    time.sleep(random.randint(1, 3))
                    pageNo_url = g_url + '/?pageNo=%s' % pageNo
                    print('='*10 + 'Crawling [%s]=======page [%s]' % (pageNo_url, pageNo) + '='*10)

                    response = requests.get(pageNo_url, headers=self.headers)
                    html = response.text
                    element = etree.HTML(html)
                    lis = element.xpath('//ul[@class="lp-list"]//li')

                    for li in lis:
                        information = {}
                        li_url = li.xpath('.//h3[@class="name"]/a/@href')
                        li_url = li_url[0] if li_url else ''

                        name = li.xpath('.//h3[@class="name"]/a/text()')
                        name = name[0]

                        address = li.xpath('.//div[@class="house-cont"]//p[@class="address"]/text()')
                        address = address[0]
                        address = re.sub(' ', '', address)
                        price = li.xpath('.//div[@class="fr"]/span/em/text()')
                        price = price[0].strip()
                        price = re.sub('元/㎡', '', price)
                        information = {
                            'url': li_url,
                            'name': name,
                            'address': address,
                            'price': price
                        }
                        save_to_mongo(information)


        def main():
            f = Fangdd()
            url = 'https://shanghai.fangdd.com/'
            html = f.get_html(url)
            address = f.get_location(html)
            urls = f.get_all_url(address)
            f.parse_list_page(urls)


        if __name__ == '__main__':
            main()
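
    To try the pipeline on a single city before launching the nationwide crawl, parse_list_page can be called directly with one hand-built listing URL (a hypothetical test snippet, not part of the original script; it assumes the code above is saved as fangdd.py):

        from fangdd import Fangdd

        f = Fangdd()
        # Crawl only Shanghai's estate listing pages and save them to MongoDB
        f.parse_list_page(['https://shanghai.fangdd.com/xiaoqu'])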

    get_pinyin.py

        from pypinyin import pinyin


        def get_pinyin(text):
            # style=0 turns off tone marks; see http://pypinyin.mozillazg.com/zh_CN/v0.9.1/ for details
            p = pinyin(text, style=0)
            # e.g. [['chong'], ['qing']]
            a = []
            for i in p:
                a.append(i[0])
            # print(p)
            b = ''.join(a)
            return b
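
    A quick usage check for get_pinyin. pypinyin's output for heteronyms depends on its phrase dictionary, which is exactly why parse_list_page above re-checks the page title instead of trusting every converted URL (the expected outputs in the comments are illustrative):

        from get_pinyin import get_pinyin

        print(get_pinyin('上海'))   # 'shanghai'
        print(get_pinyin('重庆'))   # heteronym: the result may not match the pinyin used in the real URL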

    save_to_mongo.py

        import pymongo

        client = pymongo.MongoClient('127.0.0.1', port=27017)
        db = client.fangdd_mark
        collection = db.informations


        def save_to_mongo(result):
            try:
                # insert_one is the current pymongo API; the old insert() was deprecated and later removed
                collection.insert_one(result)
                # print('success save to mongodb')
            except Exception:
                print('error saving to mongodb')
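
    Once the crawl is running, here is a quick way to see how many records have landed in MongoDB (a small verification snippet, assuming the same local database and collection as above):

        import pymongo

        client = pymongo.MongoClient('127.0.0.1', port=27017)
        # Count the estate records saved so far
        print(client.fangdd_mark.informations.count_documents({}))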

    Because of the delays, plus the fairly large amount of data, the crawl takes quite a while: I finished a whole round of Honor of Kings and the cities starting with "c" still weren't done, and by then the database already held 22,000 records.

    Run result:

  • Original post: https://www.cnblogs.com/MC-Curry/p/9552504.html