Day 39 爬虫_数据解析&验证码识别

import re
# Regular-expression practice. Each snippet is independent; every result is
# bound to a name and printed so the file also works as a plain script.
# NOTE(review): the HTML sample strings and the escaped patterns below were
# reconstructed from the exercise descriptions because the originals were
# garbled (tags and backslashes stripped) -- confirm against the lesson.

# Extract "python".
key = "javapythonc++php"
python_match = re.findall('python', key)[0]
print(python_match)
#####################################################################
# Extract "hello world" from between the two <h1> tags.
key = "<html><h1>hello world<h1></html>"
hello_world = re.findall('<h1>(.*)<h1>', key)[0]
print(hello_world)
#####################################################################
# Extract 170. The pattern must be \d+ (digits); a bare 'd+' only matches
# the letter "d".
string = '我喜欢身高为170的女孩'
digits = re.findall(r'\d+', string)
print(digits)
#####################################################################
# Extract both "http://" and "https://" (the "s" is optional).
key = 'http://www.baidu.com and https://boob.com'
schemes = re.findall('https?://', key)
print(schemes)
#####################################################################
# Extract "hello" from inside an <html> tag written in any letter case.
key = 'lalala<hTml>hello</HtMl>hahah'
html_inner = re.findall(r'<[Hh][Tt][mM][lL]>(.*)</[Hh][Tt][mM][lL]>', key)
print(html_inner)
#####################################################################
# Extract "hit.": non-greedy match up to the first literal dot, so the dot
# has to be escaped.
key = 'bobo@hit.edu.com'
hit_match = re.findall(r'h.*?\.', key)
print(hit_match)
#####################################################################
# Match "sas" and "saas" but not "saaas" (one or two "a" characters).
key = 'saas and sas and saaas'
sas_matches = re.findall('sa{1,2}s', key)
print(sas_matches)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import os
if __name__ == "__main__":
    # Page URL template; %s is replaced by the page number.
    url = 'https://www.qiushibaike.com/pic/%s/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    # Ask for the first and last page number to download (inclusive).
    page_start = int(input('enter start page:'))
    page_end = int(input('enter end page:'))
    # Create the output folder if it does not exist yet.
    if not os.path.exists('images'):
        os.mkdir('images')
    # Captures the src attribute of the image inside each "thumb" div.
    # re.S lets "." span newlines because the markup is multi-line.
    # NOTE(review): this pattern was reconstructed -- the original literal
    # was garbled (HTML stripped, string split across lines). Verify it
    # against the live page markup.
    pa = re.compile(r'<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>', re.S)
    # Parse and download the images of every requested page.
    for page in range(page_start, page_end + 1):
        print('正在下载第%d页图片' % page)
        new_url = url % page
        response = requests.get(url=new_url, headers=headers)
        # Extract the image links from the page source.
        image_urls = pa.findall(response.text)
        # Download every image found on this page.
        for image_url in image_urls:
            # The captured links have no scheme, so "https:" is prepended.
            image_url = 'https:' + image_url
            image_name = image_url.split('/')[-1]
            image_path = 'images/' + image_name
            image_data = requests.get(url=image_url, headers=headers).content
            with open(image_path, 'wb') as fp:
                fp.write(image_data)

- 需要将pip源设置为国内源，阿里源、豆瓣源、网易源等
   - windows
    （1）打开文件资源管理器(文件夹地址栏中)
    （2）地址栏上面输入 %appdata%
    （3）在这里面新建一个文件夹  pip
    （4）在pip文件夹里面新建一个文件叫做  pip.ini ,内容写如下即可
        [global]
        timeout = 6000
        index-url = https://mirrors.aliyun.com/pypi/simple/
        trusted-host = mirrors.aliyun.com
   - linux
    （1）cd ~
    （2）mkdir ~/.pip
    （3）vi ~/.pip/pip.conf
    （4）编辑内容，和windows一模一样
- 需要安装：pip install bs4
     bs4在使用时候需要一个第三方库，把这个库也安装一下
     pip install lxml

使用流程：       
    - 导包：from bs4 import BeautifulSoup
    - 使用方式：可以将一个html文档，转化为BeautifulSoup对象，然后通过对象的方法或者属性去查找指定的节点内容
        （1）转化本地文件：
             - soup = BeautifulSoup(open('本地文件'), 'lxml')
        （2）转化网络文件：
             - soup = BeautifulSoup('字符串类型或者字节类型', 'lxml')
        （3）打印soup对象显示内容为html文件中的内容
基础巩固：
    （1）根据标签名查找
        - soup.a   只能找到第一个符合要求的标签
    （2）获取属性
        - soup.a.attrs  获取a所有的属性和属性值，返回一个字典
        - soup.a.attrs['href']   获取href属性
        - soup.a['href']   也可简写为这种形式
    （3）获取内容
        - soup.a.string
        - soup.a.text
        - soup.a.get_text()
       【注意】如果标签内包含多个子节点（例如文本之外还嵌套了其他标签），那么 string 获取到的结果为 None，而 text 和 get_text() 仍然可以获取到标签内的全部文本内容
    （4）find：找到第一个符合要求的标签
        - soup.find('a')  找到第一个符合要求的
        - soup.find('a', title="xxx")
        - soup.find('a', alt="xxx")
        - soup.find('a', class_="xxx")
        - soup.find('a', id="xxx")
    （5）find_all：找到所有符合要求的标签
        - soup.find_all('a')
        - soup.find_all(['a','b']) 找到所有的a和b标签
        - soup.find_all('a', limit=2)  限制前两个
    （6）根据选择器选择指定的内容
        - select：soup.select('#feng')
        - 常见的选择器：标签选择器(a)、类选择器(.)、id选择器(#)、层级选择器
            - 层级选择器：
                div .dudu #lala .meme .xixi  下面好多级
                div > p > a > .lala          只能是下面一级
        【注意】select选择器返回永远是列表，需要通过下标提取指定的对象

import requests
from bs4 import BeautifulSoup

# Download every chapter of "Romance of the Three Kingdoms" from
# shicimingju.com and append title + text to a single UTF-8 file.
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 OPR/67.0.3575.115 (Edition B2)'
}
req_txt = requests.get(url=url, headers=header).text
soup = BeautifulSoup(req_txt, 'lxml')
# One <a> per chapter in the table of contents.
title_list = soup.select('.book-mulu > ul > li > a')
# NOTE(review): the path separators were missing in the original literal
# ('C:UsersAdministratorDesktop...'); restored here -- adjust to your machine.
# The with-block guarantees the file is flushed and closed, which the
# original bare open() never did.
with open(r'C:\Users\Administrator\Desktop\sanguo.txt', 'w', encoding='utf-8') as fp:
    for i in title_list:
        title = i.string
        # The chapter links are site-relative, so the host is prepended.
        data_url = 'http://www.shicimingju.com' + i['href']
        data_txt = requests.get(url=data_url, headers=header).text
        data_soup = BeautifulSoup(data_txt, 'lxml')
        data_tag = data_soup.find('div', class_='chapter_content')
        data_info = data_tag.text
        # A blank line separates consecutive chapters.
        fp.write(title + data_info + '\n\n')
        print(title + '爬取成功！！！')

层级定位：
from lxml import etree
tree = etree.HTML(req_txt)
tree_list = tree.xpath('/html/body/div')
属性定位：
    #找到class属性值为song的div标签
    tree.xpath('//div[@class="song"] ')
    //div[@class="song"] 
层级&索引定位：
    #找到class属性值为tang的div的直系子标签ul下的第二个子标签li下的直系子标签a
    //div[@class="tang"]/ul/li[2]/a
取文本：
    # /text() 表示只获取某个标签下直系的文本内容
    # //text() 表示获取某个标签下的文本内容和所有子标签下的文本内容
    //div[@class="song"]/p[1]/text()
    //div[@class="tang"]//text()
取属性：
    //div[@class="tang"]//li[2]/a/@href
逻辑运算：
    #找到href属性值为空且class属性值为du的a标签
    //a[@href="" and @class="du"]
模糊匹配：
    //div[contains(@class, "ng")]
    //div[starts-with(@class, "ta")]

- 本地文件：tree = etree.parse(文件名)
                tree.xpath("xpath表达式")
- 网络数据：tree = etree.HTML(网页内容字符串)
                tree.xpath("xpath表达式")

import requests
from lxml import etree
# Collect the listing titles from the 58.com second-hand housing page and
# write them to a text file, one title per line.
url = 'https://hz.58.com/ershoufang/'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 OPR/67.0.3575.115 (Edition B2)'
}
req_txt = requests.get(url=url, headers=header).text
tree = etree.HTML(req_txt)
# Alternative: fetch all titles with a single expression.
# li_list = tree.xpath('//ul[@class="house-list-wrap"]/li/div[2]/h2[@class="title"]/a/text()')
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
# NOTE(review): the path separators were missing in the original literal;
# restored here -- adjust to your machine.
with open(r'C:\Users\Administrator\Desktop\58title.txt', 'w', encoding='utf-8') as fp:
    for i in li_list:
        # "./" makes the expression relative to the current <li>.
        titles = i.xpath('./div[2]/h2/a/text()')
        if not titles:
            # Entries without a title link would otherwise raise IndexError.
            continue
        title = titles[0]
        fp.write(title + '\n')
        print(title)

import requests
import os
from lxml import etree

# Download every thumbnail listed on the pic.netbian.com category page into
# a local folder, naming each file after the image's alt text.
url = 'http://pic.netbian.com/4kmeinv/'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 OPR/67.0.3575.115 (Edition B2)'
}
# Alternative: set the response encoding by hand before reading .text.
# req_data = requests.get(url=url, headers=header)
# req_data.encoding = 'utf-8'
req_txt = requests.get(url=url, headers=header).text
tree = etree.HTML(req_txt)
li_list = tree.xpath('//ul[@class="clearfix"]/li')

# NOTE(review): the path separators were missing in the original literal;
# restored here -- adjust to your machine.
pic_dir = r'C:\Users\Administrator\Desktop\pic'
if not os.path.exists(pic_dir):
    os.makedirs(pic_dir)

for i in li_list:
    pic_url = 'http://pic.netbian.com' + i.xpath('./a/img/@src')[0]
    print(pic_url)
    pic_alt = i.xpath('./a/img/@alt')[0]
    # General fix for mojibake: undo the wrong iso-8859-1 decoding and
    # decode the raw bytes as GBK instead.
    pic_alt = pic_alt.encode('iso-8859-1').decode('gbk')
    pic_info = requests.get(url=pic_url, headers=header).content
    # The extension is appended exactly once (the original added '.jpg'
    # twice, producing names ending in '.jpg.jpg').
    pic_name = pic_alt + '.jpg'
    pic = os.path.join(pic_dir, pic_name)
    with open(pic, 'wb') as f:
        f.write(pic_info)
    print(pic_name + '下载完成！！！')
print('over')

import requests
from lxml import etree

# List the "hot" cities and all cities shown on the aqistudy.cn history page.
url = 'https://www.aqistudy.cn/historydata'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 OPR/67.0.3575.115 (Edition B2)'
}
req_txt = requests.get(url=url, headers=header).text
tree = etree.HTML(req_txt)
# First approach, kept for reference: query the two lists separately.
# hot_li_list = tree.xpath('//div[@class="hot"]/div[@class="bottom"]/ul/li')
# all_li_list = tree.xpath('//div[@class="all"]/div[@class="bottom"]/ul/div[2]/li')
# hot_city_list = []
# all_city_list = []
# for i in hot_li_list:
#     hot_city = i.xpath('./a/text()')[0]
#     hot_city_list.append(hot_city)
#
# for i in all_li_list:
#     all_city = i.xpath('./a/text()')[0]
#     all_city_list.append(all_city)
#
# print(hot_city_list)
# print(all_city_list)
# print(len(all_city_list))

# Second approach: one XPath expression; "|" unions the hot-city anchors
# with the all-city anchors.
# The result is named city_links rather than `all` so the builtin all()
# is not shadowed.
city_links = tree.xpath('//div[@class="hot"]/div[@class="bottom"]/ul/li/a | //div[@class="all"]/div[@class="bottom"]/ul/div[2]/li/a')
all_list = [link.xpath('./text()')[0] for link in city_links]
print(all_list)
print(len(all_list))

import requests
import os
from lxml import etree

# Download the free resume templates listed on the first two index pages of
# sc.chinaz.com into a local folder.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 OPR/67.0.3575.115 (Edition B2)'
}
# Index pages from the second one on follow this template; the first page
# has its own URL (see below).
url = 'http://sc.chinaz.com/jianli/free_%d.html'

# NOTE(review): the path separators were missing in the original literal;
# restored here -- adjust to your machine.
jianli_dir = r'C:\Users\Administrator\Desktop\jianli'
# Created once up front instead of being re-checked for every index page.
if not os.path.exists(jianli_dir):
    os.makedirs(jianli_dir)

for u in range(1, 3):
    if u == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % u

    req_txt = requests.get(url=new_url, headers=header).text
    tree = etree.HTML(req_txt)
    tree_list = tree.xpath('//div[@id = "container"]')

    for i in tree_list:
        # Links to the detail page of each template.
        list_url = i.xpath('.//a[1]/@href')
        for li in list_url:
            detail_txt = requests.get(url=li, headers=header).text
            jianli_tree = etree.HTML(detail_txt)
            # The second link in the download list is used.
            # NOTE(review): raises IndexError if a detail page offers fewer
            # than two links -- confirm against the site.
            jianli_url = jianli_tree.xpath('//ul[@class="clearfix"]/li/a/@href')[1]
            jianli_data = requests.get(url=jianli_url, headers=header).content
            janli_name = jianli_url.split('/')[-1]
            file_url = os.path.join(jianli_dir, janli_name)
            with open(file_url, 'wb') as f:
                f.write(jianli_data)
            print(janli_name + '   爬取成功！！！')

print('爬取完毕')

import http.client, mimetypes, urllib, json, time, requests
######################################################################
class YDMHttp:
    """Small client for the Yundama captcha-recognition HTTP API.

    Every call POSTs a form to ``apiurl`` that carries the account
    credentials plus a ``method`` field, and parses the reply as JSON.
    Failures are reported through integer codes instead of exceptions:
    a negative ``ret`` from the server is passed through, and ``-9001``
    is returned when the parsed reply is empty.
    """

    # NOTE(review): plain-HTTP endpoint -- the credentials are sent
    # unencrypted.
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account and application credentials.

        ``appid`` is converted to ``str`` because it is sent as a form field.
        """
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every call, plus ``extra``."""
        data = {'method': method, 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        data.update(extra)
        return data

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the parsed JSON.

        ``files`` maps form-field names to file paths; ``None`` means no
        upload. (The default used to be a shared mutable ``[]``.)
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, a negative ``ret`` code, or -9001."""
        response = self.request(self._payload('balance'))
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response['balance']

    def login(self):
        """Log in and return the user id, a negative ``ret`` code, or -9001."""
        response = self.request(self._payload('login'))
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response['uid']

    def upload(self, filename, codetype, timeout):
        """Upload the captcha image at path ``filename``.

        Returns the captcha id (``cid``), a negative ``ret`` code, or -9001.
        """
        data = self._payload('upload', codetype=str(codetype), timeout=str(timeout))
        file = {'file': filename}
        response = self.request(data, file)
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response['cid']

    def result(self, cid):
        """Return the recognised text for ``cid``, or '' if there is none."""
        response = self.request(self._payload('result', cid=str(cid)))
        if response and response['text']:
            return response['text']
        return ''

    def decode(self, filename, codetype, timeout):
        """Upload a captcha and poll for its text.

        Polls ``result`` up to ``timeout`` times, sleeping one second after
        each empty answer. Returns ``(cid, text)`` on success, ``(-3003, '')``
        when no text arrived in time, or ``(code, '')`` when the upload
        itself returned a non-positive code.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            for _ in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                time.sleep(1)
            return -3003, ''
        return cid, ''

    def report(self, cid):
        """Send a report for ``cid`` with flag '0'; return ``ret`` or -9001."""
        response = self.request(self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` to ``url`` and return the response body as text.

        ``files`` maps form-field names to file paths. The files are opened
        into a separate dict, so the caller's mapping is left untouched, and
        every handle is closed once the request finishes -- the original
        overwrote the caller's dict and never closed the files.
        """
        handles = {}
        try:
            for key, path in (files or {}).items():
                handles[key] = open(path, 'rb')
            res = requests.post(url, files=handles, data=fields)
            return res.text
        finally:
            for handle in handles.values():
                handle.close()

######################################################################
# Account name.
username = 'username'
# Account password.
password = 'password'
# Software ID: required for developer revenue sharing; shown under
# "My software" in the developer console.
appid = 1
# Software key: required for developer revenue sharing; shown under
# "My software" in the developer console.
appkey = '22cc5376925e9387a23cf797cb9ba745'
# Path of the captcha image to recognise.
filename = 'getimage.jpg'
# Captcha type, e.g. 1004 = four letters/digits. Types are priced
# differently and a wrong type hurts the recognition rate; the full list is
# at http://www.yundama.com/price.html
codetype = 1004
# Timeout in seconds.
timeout = 60

# Only talk to the service once the placeholder account name was replaced.
if username != 'username':
    # Create the client.
    yundama = YDMHttp(username, password, appid, appkey)
    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)
    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)
    # Recognise the image: path, captcha type id and timeout (seconds) go
    # in; the captcha id and the recognised text come back.
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
else:
    print('请设置好相关参数再测试')
######################################################################

# -*- coding: utf-8 -*-

# @File    : YDM.py
# @Date    : 2020-04-12
# @Author  : Administrator
import http.client, mimetypes, urllib, json, time, requests

class YDMHttp:
    """Small client for the Yundama captcha-recognition HTTP API.

    Every call POSTs a form to ``apiurl`` that carries the account
    credentials plus a ``method`` field, and parses the reply as JSON.
    Failures are reported through integer codes instead of exceptions:
    a negative ``ret`` from the server is passed through, and ``-9001``
    is returned when the parsed reply is empty.
    """

    # NOTE(review): plain-HTTP endpoint -- the credentials are sent
    # unencrypted.
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account and application credentials.

        ``appid`` is converted to ``str`` because it is sent as a form field.
        """
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every call, plus ``extra``."""
        data = {'method': method, 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        data.update(extra)
        return data

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the parsed JSON.

        ``files`` maps form-field names to file paths; ``None`` means no
        upload. (The default used to be a shared mutable ``[]``.)
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, a negative ``ret`` code, or -9001."""
        response = self.request(self._payload('balance'))
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response['balance']

    def login(self):
        """Log in and return the user id, a negative ``ret`` code, or -9001."""
        response = self.request(self._payload('login'))
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response['uid']

    def upload(self, filename, codetype, timeout):
        """Upload the captcha image at path ``filename``.

        Returns the captcha id (``cid``), a negative ``ret`` code, or -9001.
        """
        data = self._payload('upload', codetype=str(codetype), timeout=str(timeout))
        file = {'file': filename}
        response = self.request(data, file)
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response['cid']

    def result(self, cid):
        """Return the recognised text for ``cid``, or '' if there is none."""
        response = self.request(self._payload('result', cid=str(cid)))
        if response and response['text']:
            return response['text']
        return ''

    def decode(self, filename, codetype, timeout):
        """Upload a captcha and poll for its text.

        Polls ``result`` up to ``timeout`` times, sleeping one second after
        each empty answer. Returns ``(cid, text)`` on success, ``(-3003, '')``
        when no text arrived in time, or ``(code, '')`` when the upload
        itself returned a non-positive code.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            for _ in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                time.sleep(1)
            return -3003, ''
        return cid, ''

    def report(self, cid):
        """Send a report for ``cid`` with flag '0'; return ``ret`` or -9001."""
        response = self.request(self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` to ``url`` and return the response body as text.

        ``files`` maps form-field names to file paths. The files are opened
        into a separate dict, so the caller's mapping is left untouched, and
        every handle is closed once the request finishes -- the original
        overwrote the caller's dict and never closed the files.
        """
        handles = {}
        try:
            for key, path in (files or {}).items():
                handles[key] = open(path, 'rb')
            res = requests.post(url, files=handles, data=fields)
            return res.text
        finally:
            for handle in handles.values():
                handle.close()

Day 39 爬虫_数据解析&验证码识别

引入

python如何实现数据解析

1、正则表达式

常用正则表达式回顾

正则练习

2、bs4解析

环境安装

基础使用

3、xpath解析

引入

环境安装

解析原理

常用xpath表达式

etree对象实例化

验证码识别

what is 验证码？

验证码和爬虫的爱恨情仇

云打码平台

使用流程

示例代码展示

平台提供的类

平台提供的调用程序