• python爬虫实战<一>


    #!/usr/bin/env python
    #-*- coding:utf-8-*-
    
    """
    @author:    wangzhu
    @desc:  get rental listing titles from danke.com (Hangzhou)
    @contact:   isaac.zhu@dbappsecurity.com.cn
    @date:  2019/8/7
    """
    
    import requests  #导入请求包
    import re  #导入正则包
    from random import randint
    
    
    """
    网站地址:https://www.danke.com/room/hz
    """
    
    # Pool of request headers, each carrying one browser User-Agent string.
    # CrawlerHouse picks one entry at random for its request.
    hds = [
        {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
        {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
        {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
        {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
        # The closing parenthesis was missing from this entry in the original list.
        {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
        {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
        {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
    ]
    
    def _extract_titles(html):
        """Return the stripped title text of every listing block found in *html*."""
        # Non-greedy match up to the first </div>: for each listing this captures
        # the location badge and the title link that follows it.
        blocks = re.findall(r'<div class="r_lbx_cena">(.*?)</div>', html, re.S)
        # Read the anchor text directly. This no longer relies on a running
        # counter matching the number shown in <span class="location">.
        return [
            ''.join(re.findall(r'<a\b[^>]*>(.*?)</a>', block, re.S)).strip()
            for block in blocks
        ]


    def CrawlerHouse(url="https://www.danke.com/room/hz", timeout=10):
        """Download a room listing page and print the title of each listing.

        Args:
            url: Listing page to fetch. Defaults to the Hangzhou room list.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an HTTP error status.
        """
        # Send a randomly chosen User-Agent header with the request.
        res = requests.get(url, headers=hds[randint(0, len(hds) - 1)], timeout=timeout)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        res.raise_for_status()
        res.encoding = "utf-8"  # decode the response body as UTF-8
        for title in _extract_titles(res.text):
            print(title)
    
    # Run the crawler
    CrawlerHouse()

        <div class="r_ls_box">
                                
                <div class="r_lbx">
                    <a href="javascript:void(0)" class="rimg" key='0' xiaoqu='万科北宸之光'>
                        <span class="img-hint">
                            <span></span>
                            <span></span>
                        </span>
                        <img
                                src="https://public.danke.com.cn/public-20190123-isz_ljR3BG1JKKfa2lXEilpNXgN1NTRV?imageView2/1/w/380/h/285" width="260" height="173"
                                title=""
                                alt="图片"/>
    
                                        </a>
                    <div class="r_lbx_cen">
                        <div class="r_lbx_cena">
                            <span class="location">1</span>
                            <a href="https://www.danke.com/duanzu/1913140756.html" key='0' xiaoqu='万科北宸之光' target="_blank"
                               title="万达广场  万科北宸之光 3室2厅">
                                万达广场  万科北宸之光 3室2厅
                            </a>
                                                        <div class="r_lbx_cena">
                                    <div class="sub_img"></div>
                                    距5号线大运河站2700米
                                </div>
                                                </div>
                        <div class="r_lbx_cenb">
                            <div class="address_img"></div>
                            建筑面积约12㎡ | 21楼
                            | 3室1卫                          | 朝南
                                                        <i>合</i>
                                                </div>
                        <div class="r_lbx_cenc">
                                                                        </div>
                                        </div>
                    <div class="r_lbx_money">
                                                <div class="r_lbx_moneya">
                                                                <span class="ty_b">1890</span> 元/月
                                                        </div>
    
                                            <a class="lk_more" key='0' xiaoqu='万科北宸之光' href="https://www.danke.com/duanzu/1913140756.html"
                           target="_blank">
                            查看详情
                        </a>
                    </div>
                </div>
    
    
    <div class="r_ls_box">
    <div class="r_ls_box">
  • 相关阅读:
    TextBox 只有下划线
    can't find web control library(web控件库)
    DropDownListSalesAC”有一个无效 SelectedValue,因为它不在项目列表中。
    IDE、SATA、SCSI、SAS、FC、SSD 硬盘类型
    如何打印1px表格
    CSS控制打印 分页
    Virtual Server could not open its emulated Ethernet switch driver. To fix this problem, reenable the Virtual Server Emulated Et
    Xml中SelectSingleNode方法中的xpath用法
    热带水果莫入冰箱?水果存放冰箱大法
    探索Asp.net的Postback机制
  • 原文地址:https://www.cnblogs.com/gufengchen/p/12420798.html
Copyright © 2020-2023  润新知