• Web Crawler (2)


      Grab the request headers from the browser's developer tools and use them to disguise the script as a browser, so the data can be fetched more reliably.

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    import requests
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    try:
        res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to the get method
        print(res.text)
    except requests.exceptions.ConnectionError:
        print('Connection refused')
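
    Whether the headers actually matter depends on the site's anti-bot rules; a quick way to check is to compare the status codes of the same request sent with and without them (a minimal sketch, results vary by site):

    import requests
    
    url = 'http://bj.xiaozhu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # without headers, requests announces itself as 'python-requests/x.y', which some sites reject
    plain = requests.get(url)
    disguised = requests.get(url, headers=headers)
    print(plain.status_code, disguised.status_code)  # e.g. 403 vs 200 on a site that filters bots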
    
    
    # The Soup document parsed with the BeautifulSoup library is standard structured data, which is nicer to work with than the raw text above
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    try:
        res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to the get method
        soup = BeautifulSoup(res.text, 'html.parser')
        print(soup.prettify())
    except requests.exceptions.ConnectionError:
        print('Connection refused')
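
    Once parsed, the Soup document can be queried directly; a few illustrative accessors (these assume the page has a <title> tag and at least one link):

    print(soup.title.get_text())    # text of the <title> tag
    first_link = soup.find('a')     # first <a> tag in the document
    print(first_link.get('href'))   # value of its href attribute
    print(len(soup.find_all('a')))  # how many links the page contains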

    Updated: the price of a single listing can be located with a selector copied from Chrome (right-click the element > Copy > Copy selector); note the li:nth-child(1) step, which pins it to the first listing:

    price = soup.select('#page_list > ul > li:nth-child(1) > div.result_btm_con.lodgeunitname > div:nth-child(1) > '
                        'span.result_price > i')

    Full code, with :nth-child(1) dropped from the li step so the selector matches the price of every listing on the page:

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 '
                      'Safari/537.36'
    }
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to the get method
    
    soup = BeautifulSoup(res.text, 'html.parser')
    # locate the elements and extract them with a CSS selector
    prices = soup.select(
        '#page_list > ul > li > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
    for price in prices:
        print(price.get_text())
        # print(price) would output the whole tag, markup included
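
    For reference, the difference between printing the tag and printing its text, shown with a self-contained snippet built from the markup quoted in the Note at the end:

    from bs4 import BeautifulSoup
    
    tag = BeautifulSoup('<i>488</i>', 'html.parser').i
    print(tag)             # <i>488</i>  (the whole tag, markup included)
    print(tag.get_text())  # 488         (the text content only)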

    Scraping short-term rental listings in Beijing:

    import random
    import time
    
    import requests
    from bs4 import BeautifulSoup
    
    # add request headers to disguise the script as a browser
    headers = {
        # User-Agent copied from the Chrome developer tools
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    
    
    # judge the host's gender from the CSS class of the member icon
    def judgment_sex(class_name):
        if class_name == ['member_ico1']:   # 'member_ico1' is the female icon class
            return 'female'
        else:
            return 'male'
    
    
    # collect the detail-page URLs from one listing page
    def get_links(url):
        try:
            wb_data = requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError:
            print('Connection refused')
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        links = soup.select('#page_list > ul > li > a')
        for link in links:
            href = link.get("href")
            get_info(href)
    
    
    # scrape the fields of interest from one detail page
    def get_info(url):
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # selectors copied from the browser (Copy selector)
        titles = soup.select('div.pho_info > h4')
        addresses = soup.select('span.pr5')
        prices = soup.select('#pricePart > div.day_l > span')
        images = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
        names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
        for title, address, price, image, name, sex in zip(titles, addresses, prices, images, names, sexs):
            data = {
                'title': title.get_text().strip(),
                'address': address.get_text().strip(),
                'price': price.get_text(),
                'image': image.get("src"),
                'name': name.get_text(),
                'sex': judgment_sex(sex.get("class"))
            }
            print(data)
    
    
    if __name__ == '__main__':
    
        urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 14)]
        for single_url in urls:
            get_links(single_url)
            # sleep 10-13 seconds between pages to avoid an IP ban
            time.sleep(random.randint(10, 13))
    
        # drawback: no IP management; relying on sleep alone is slow
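
    The missing IP management can be sketched with the proxies parameter of requests; the pool below is a placeholder, and you would substitute real proxy addresses or a paid proxy service:

    import random
    import requests
    
    # hypothetical proxy pool: replace with working proxies
    PROXY_POOL = [
        'http://127.0.0.1:8001',
        'http://127.0.0.1:8002',
    ]
    
    def get_with_proxy(url, headers):
        # route each request through a random proxy so no single IP gets rate-limited
        proxy = random.choice(PROXY_POOL)
        return requests.get(url, headers=headers,
                            proxies={'http': proxy, 'https': proxy}, timeout=10)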

     Scraping the KuGou TOP500 chart, version 1.0:

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import random
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    
    def get_info(url):
        """Scrape one page of the chart."""
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        ranks = soup.select('span.pc_temp_num')
        titles = soup.select('div.pc_temp_songlist>ul>li>a')
        times = soup.select('span.pc_temp_tips_r>span')
        # the loop variable is named song_time so it does not shadow the time module
        for rank, title, song_time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-', 1)[1],  # split once, in case the song name contains '-'
                'time': song_time.get_text().strip()
            }
            print(data)
    
    
    
    if __name__ == '__main__':
        # program entry point
        urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
        for url in urls:
            get_info(url)
            # sleep a few seconds between pages to avoid hammering the server
            time.sleep(random.randint(3, 5))

     Scraping the KuGou TOP500 chart, version 1.1 (same flow, with tighter class-based selectors for the song name and duration):

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import random
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    
    def get_info(url):
        """Scrape one page of the chart."""
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        ranks = soup.select('span.pc_temp_num')
        titles = soup.select('a.pc_temp_songname')
        times = soup.select('span.pc_temp_time')
        for rank, title, song_time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-', 1)[1],  # split once, in case the song name contains '-'
                'time': song_time.get_text().strip()
            }
            print(data)
    
    
    
    if __name__ == '__main__':
        # program entry point
        urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
        for url in urls:
            get_info(url)
            # sleep a few seconds between pages to avoid hammering the server
            time.sleep(random.randint(3, 5))

     Scraping the prices with a regular expression:

    import re
    import requests
    res = requests.get('http://bj.xiaozhu.com/')
    prices = re.findall('<span class="result_price">&#165;<i>(.*?)</i>起/晚</span>', res.text)
    for price in prices:
        print(price)

    Note:

    <span class="result_price">¥<i>488</i>起/晚</span>
    In the element panel ¥ and &#165; are equivalent, but the raw HTML the crawler receives contains the entity &#165;, so the literal ¥ character must not appear in the pattern.
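
    Alternatively, the entities can be unescaped first with the standard library's html module, after which the pattern may use the rendered character directly (a sketch):

    import html
    import re
    import requests
    
    res = requests.get('http://bj.xiaozhu.com/')
    text = html.unescape(res.text)  # turns &#165; back into ¥
    prices = re.findall('<span class="result_price">¥<i>(.*?)</i>起/晚</span>', text)
    print(prices)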
• Original post: https://www.cnblogs.com/King-boy/p/10901389.html