• Web Crawler (2)


      Grab the request headers from the browser's developer tools and use them to disguise the script as a browser, so the data can be fetched more reliably.

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    import requests
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    try:
        res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to the get method
        print(res.text)
    except requests.exceptions.ConnectionError:
        print('Connection refused')
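
    Whether the headers actually matter depends on the site's anti-bot rules; a quick way to check is to compare the status codes of the same request sent with and without them (a minimal sketch, results vary by site):

    import requests
    
    url = 'http://bj.xiaozhu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # without headers, requests announces itself as 'python-requests/x.y', which some sites reject
    plain = requests.get(url)
    disguised = requests.get(url, headers=headers)
    print(plain.status_code, disguised.status_code)  # e.g. 403 vs 200 on a site that filters bots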
    
    
    # The Soup document parsed with the BeautifulSoup library is standard structured data, which is nicer to work with than the raw text above
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    try:
        res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to the get method
        soup = BeautifulSoup(res.text, 'html.parser')
        print(soup.prettify())
    except requests.exceptions.ConnectionError:
        print('Connection refused')
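
    Once parsed, the Soup document can be queried directly; a few illustrative accessors (these assume the page has a <title> tag and at least one link):

    print(soup.title.get_text())    # text of the <title> tag
    first_link = soup.find('a')     # first <a> tag in the document
    print(first_link.get('href'))   # value of its href attribute
    print(len(soup.find_all('a')))  # how many links the page contains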

    Updated: the price of a single listing can be located with a selector copied from Chrome (right-click the element > Copy > Copy selector); note the li:nth-child(1) step, which pins it to the first listing:

    price = soup.select('#page_list > ul > li:nth-child(1) > div.result_btm_con.lodgeunitname > div:nth-child(1) > '
                        'span.result_price > i')

    Full code, with :nth-child(1) dropped from the li step so the selector matches the price of every listing on the page:

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 '
                      'Safari/537.36'
    }
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to the get method
    
    soup = BeautifulSoup(res.text, 'html.parser')
    # locate the elements and extract them with a CSS selector
    prices = soup.select(
        '#page_list > ul > li > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
    for price in prices:
        print(price.get_text())
        # print(price) would output the whole tag, markup included
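
    For reference, the difference between printing the tag and printing its text, shown with a self-contained snippet built from the markup quoted in the Note at the end:

    from bs4 import BeautifulSoup
    
    tag = BeautifulSoup('<i>488</i>', 'html.parser').i
    print(tag)             # <i>488</i>  (the whole tag, markup included)
    print(tag.get_text())  # 488         (the text content only)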

    Scraping short-term rental listings in Beijing:

    import random
    import time
    
    import requests
    from bs4 import BeautifulSoup
    
    # add request headers to disguise the script as a browser
    headers = {
        # User-Agent copied from the Chrome developer tools
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    
    
    # judge the host's gender from the CSS class of the member icon
    def judgment_sex(class_name):
        if class_name == ['member_ico1']:   # 'member_ico1' is the female icon class
            return 'female'
        else:
            return 'male'
    
    
    # collect the detail-page URLs from one listing page
    def get_links(url):
        try:
            wb_data = requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError:
            print('Connection refused')
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        links = soup.select('#page_list > ul > li > a')
        for link in links:
            href = link.get("href")
            get_info(href)
    
    
    # scrape the fields of interest from one detail page
    def get_info(url):
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # selectors copied from the browser (Copy selector)
        titles = soup.select('div.pho_info > h4')
        addresses = soup.select('span.pr5')
        prices = soup.select('#pricePart > div.day_l > span')
        images = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
        names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
        for title, address, price, image, name, sex in zip(titles, addresses, prices, images, names, sexs):
            data = {
                'title': title.get_text().strip(),
                'address': address.get_text().strip(),
                'price': price.get_text(),
                'image': image.get("src"),
                'name': name.get_text(),
                'sex': judgment_sex(sex.get("class"))
            }
            print(data)
    
    
    if __name__ == '__main__':
    
        urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 14)]
        for single_url in urls:
            get_links(single_url)
            # sleep 10-13 seconds between pages to avoid an IP ban
            time.sleep(random.randint(10, 13))
    
        # drawback: no IP management; relying on sleep alone is slow
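
    The missing IP management can be sketched with the proxies parameter of requests; the pool below is a placeholder, and you would substitute real proxy addresses or a paid proxy service:

    import random
    import requests
    
    # hypothetical proxy pool: replace with working proxies
    PROXY_POOL = [
        'http://127.0.0.1:8001',
        'http://127.0.0.1:8002',
    ]
    
    def get_with_proxy(url, headers):
        # route each request through a random proxy so no single IP gets rate-limited
        proxy = random.choice(PROXY_POOL)
        return requests.get(url, headers=headers,
                            proxies={'http': proxy, 'https': proxy}, timeout=10)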

     Scraping the KuGou TOP500 chart, version 1.0:

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import random
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    
    def get_info(url):
        """Scrape one page of the chart."""
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        ranks = soup.select('span.pc_temp_num')
        titles = soup.select('div.pc_temp_songlist>ul>li>a')
        times = soup.select('span.pc_temp_tips_r>span')
        # the loop variable is named song_time so it does not shadow the time module
        for rank, title, song_time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-', 1)[1],  # split once, in case the song name contains '-'
                'time': song_time.get_text().strip()
            }
            print(data)
    
    
    
    if __name__ == '__main__':
        # program entry point
        urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
        for url in urls:
            get_info(url)
            # sleep a few seconds between pages to avoid hammering the server
            time.sleep(random.randint(3, 5))

     Scraping the KuGou TOP500 chart, version 1.1 (same flow, with tighter class-based selectors for the song name and duration):

    #!/usr/bin/env python
    # -*- encoding:UTF-8 -*-
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import random
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    
    def get_info(url):
        """Scrape one page of the chart."""
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        ranks = soup.select('span.pc_temp_num')
        titles = soup.select('a.pc_temp_songname')
        times = soup.select('span.pc_temp_time')
        for rank, title, song_time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-', 1)[1],  # split once, in case the song name contains '-'
                'time': song_time.get_text().strip()
            }
            print(data)
    
    
    
    if __name__ == '__main__':
        # program entry point
        urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
        for url in urls:
            get_info(url)
            # sleep a few seconds between pages to avoid hammering the server
            time.sleep(random.randint(3, 5))

     Scraping the prices with a regular expression:

    import re
    import requests
    res = requests.get('http://bj.xiaozhu.com/')
    prices = re.findall('<span class="result_price">&#165;<i>(.*?)</i>起/晚</span>', res.text)
    for price in prices:
        print(price)

    Note:

    <span class="result_price">¥<i>488</i>起/晚</span>
    In the element panel ¥ and &#165; are equivalent, but the raw HTML the crawler receives contains the entity &#165;, so the literal ¥ character must not appear in the pattern.
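
    Alternatively, the entities can be unescaped first with the standard library's html module, after which the pattern may use the rendered character directly (a sketch):

    import html
    import re
    import requests
    
    res = requests.get('http://bj.xiaozhu.com/')
    text = html.unescape(res.text)  # turns &#165; back into ¥
    prices = re.findall('<span class="result_price">¥<i>(.*?)</i>起/晚</span>', text)
    print(prices)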
• Original post: https://www.cnblogs.com/King-boy/p/10901389.html