• Case study: scraping short-term rental listings in the Nanjing area


    Welp, the site's anti-scraping blocking is really heavy; this was hard to finish.
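
    Since the site blocks scrapers so aggressively, one common mitigation is to space out requests, rotate User-Agent strings, and retry with a delay. A minimal sketch, assuming only the requests library; the polite_get helper, the UA list, and the retry counts are illustrative, not part of the original post:

    import random
    import time
    import requests

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0',
    ]

    def polite_get(url, retries=3, base_delay=5):
        # Rotate User-Agents and back off between attempts to reduce the chance of being blocked
        for attempt in range(retries):
            resp = requests.get(url, headers={'User-Agent': random.choice(USER_AGENTS)})
            if resp.status_code == 200:
                return resp
            time.sleep(base_delay * (attempt + 1))  # wait longer after each failed attempt
        return None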

    The first draft below only collects and prints the listing links from one search page:

    # Scrape Nanjing short-term rental listings from xiaozhu.com (13 pages planned)
    # Import the BeautifulSoup, requests, and time libraries
    from bs4 import BeautifulSoup
    import requests
    import time
    
    # Request headers: set a User-Agent to masquerade as a browser
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    
    def judgment_sex(class_name):
        # The 'member_icol' icon class marks a female host; otherwise assume male
        if class_name == ['member_icol']:
            return 'female'
        else:
            return 'male'
            
    
    def get_links(url):
        web_data=requests.get(url,headers=headers)
        soup=BeautifulSoup(web_data.text,'lxml')
        links=soup.select('#page_list > ul > li > a')
        for link in links:
            href=link.get('href')
            print(href)
            
    if __name__=='__main__':
        urls='http://nj.xiaozhu.com/search-duanzufang-p2-0/'
        get_links(urls)
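
    Before looping over many pages, it is worth checking whether a request actually returned a listings page rather than a block or captcha page. A minimal sketch; the '#page_list > ul > li > a' selector is the one used above, while the "no links means blocked" heuristic is an assumption:

    import requests
    from bs4 import BeautifulSoup

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    resp = requests.get('http://nj.xiaozhu.com/search-duanzufang-p2-0/', headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    if resp.status_code != 200 or not links:
        # An empty result usually means the site served a block or captcha page
        print('Likely blocked: status {}, {} links found'.format(resp.status_code, len(links)))
    else:
        print('OK: {} listings on the page'.format(len(links)))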

    The full script, which follows each listing link and scrapes the detail page:

    # Scrape 13 pages of Nanjing short-term rental listings from xiaozhu.com
    # Import the BeautifulSoup, requests, and time libraries
    from bs4 import BeautifulSoup
    import requests
    import time

    # Request headers: set a User-Agent to masquerade as a browser
    headers = {
        'User-Agent': 'Nokia6600/1.0 (3.42.1) SymbianOS/7.0s Series60/2.0 Profile/MIDP-2.0 Configuration/CLDC-1.0'
        #'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }

    def judgment_sex(class_name):
        # The 'member_icol' icon class marks a female host; otherwise assume male
        if class_name == ['member_icol']:
            return 'female'
        else:
            return 'male'

    def get_links(url):
        # Collect every listing link on one search-results page and scrape each detail page
        web_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(web_data.text, 'lxml')
        links = soup.select('#page_list > ul > li > a')
        for link in links:
            href = link.get('href')
            time.sleep(10)  # pause between detail requests to avoid being blocked
            get_info(href)

    def get_info(url):
        # Scrape title, address, price, host photo, host name, and host sex from a detail page
        wb_data = requests.get(url, headers=headers)
        print(wb_data)  # show the HTTP response status while debugging
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('div.pho_info > h4')
        addresses = soup.select('span.pr5')
        prices = soup.select('#pricePart > div.day_l > span')
        imgs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
        names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
        for title, address, price, img, name, sex in zip(titles, addresses, prices, imgs, names, sexs):
            data = {
                'title': title.get_text().strip(),
                'address': address.get_text().strip(),
                'price': price.get_text(),
                'img': img.get('src'),
                'name': name.get_text(),
                'sex': judgment_sex(sex.get('class'))
            }
            print(data)

    if __name__ == '__main__':
        # Pages 1 through 13 of the Nanjing search results
        urls = ['http://nj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 14)]
        for single_url in urls:
            get_links(single_url)
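
    The script above only prints each data dict. A minimal sketch of writing the rows to a CSV file instead; the save_rows helper and the file name are illustrative, and the field names mirror the dict built in get_info:

    import csv

    FIELDS = ['title', 'address', 'price', 'img', 'name', 'sex']

    def save_rows(rows, path='xiaozhu_nanjing.csv'):
        # rows: an iterable of dicts shaped like the data dict built in get_info
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDS)
            writer.writeheader()
            writer.writerows(rows)

    get_info could append its dicts to a list and hand that list to save_rows once all pages are done.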

     

  • Original post: https://www.cnblogs.com/lanbofei/p/8709703.html