• day02


    1. Crawling data returned by AJAX requests (in a loop):

    from urllib.request import Request,urlopen
    base_url = "https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start={0}&limit={1}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    i = 0
    while True:
        url = base_url.format(i*20,20)
        print(url)
        request = Request(url,headers=headers)
        response = urlopen(request)
        content = response.read().decode()
        if content == "[]" or not content:    # stop once the crawled content comes back empty
            break
        print(content)
        print(i)
        i += 1

     2. Ignoring certificate verification errors:
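
    A minimal sketch of one common urllib approach (the URL here is only a placeholder): pass an unverified SSL context to urlopen so certificate errors are ignored.

    from urllib.request import Request, urlopen
    import ssl

    url = "https://www.12306.cn/index/"   # placeholder HTTPS site
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    request = Request(url, headers=headers)
    context = ssl._create_unverified_context()   # SSL context that skips certificate checks
    response = urlopen(request, context=context)
    print(response.read().decode())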

     3. Using a proxy:

       1) Create a build_opener object yourself and call its open method:

    from urllib.request import Request,build_opener
    from fake_useragent import UserAgent
    
    url = "http://httpbin.org/get"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    request = Request(url,headers=headers)
    # urlopen internally creates an opener with build_opener and calls its open method
    # here we create the opener ourselves and call open directly
    opener = build_opener()
    response = opener.open(request)  # send the request
    print(response.read().decode())

       2) Proxy IPs: keep your own IP from getting banned:

    from urllib.request import Request,build_opener,ProxyHandler
    from fake_useragent import UserAgent
    url = "http://httpbin.org/get"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    request = Request(url,headers=headers)
    
    # proxy = ProxyHandler({"http":"username:pwd@120.27.224.41:16818"})  # dedicated proxy IP (paid)
    proxy = ProxyHandler({"http":"163.125.249.117:8118"})  # free proxy IP
    opener = build_opener(proxy)
    response = opener.open(request)
    print(response.read().decode())

    4. Cookies:

      1) Using cookies from a manual login:

    from urllib.request import HTTPCookieProcessor,Request,build_opener
    from fake_useragent import UserAgent
    url = "https://www.sxt.cn/index/user.html"
    headers = {
        "User-Agent":UserAgent().random,
        # log in to the site in a browser, copy the cookie it returns, and put it in the request headers
        "Cookie":"UM_distinctid=1677e8c7912122-0191ed60274b72-5c10301c-e1000-1677e8c791420d; 53gid2=10689069155008; 53gid1=10689069155008; 53revisit=1544016395113; 53gid0=10689069155008; PHPSESSID=rgp2u6edmipoflg2i4ivqefp70; visitor_type=old; 53kf_72085067_from_host=www.sxt.cn;  53kf_72085067_land_page=https%253A%252F%252Fwww.sxt.cn%252Findex.html;  kf_72085067_land_page_ok=1; 53kf_72085067_keyword=https%3A%2F%2Fwww.sxt.cn%2Findex%2Flogin%2Flogin.html; CNZZDATA1261969808=954793838-1544013478-https%253A%252F%252Fwww.sxt.cn%252F%7C1544071377"
    }
    request = Request(url,headers=headers)
    opener = build_opener()
    response = opener.open(request)
    print(response.read().decode())

      2) Logging in from code and reusing the returned cookies:

    from urllib.request import HTTPCookieProcessor,Request,build_opener
    from fake_useragent import UserAgent
    from urllib.parse import urlencode
    
    login_url = "https://www.sxt.cn/index/login/login"
    data = {
        "user": "17703181473",
        "password": "123456",
    }
    headers = {
        "User-Agent":UserAgent().random,
    }
    request = Request(login_url,headers=headers,data=urlencode(data).encode())
    cookie = HTTPCookieProcessor()
    opener = build_opener(cookie)          # the returned cookie is kept in this opener; reuse it for the next request
    response = opener.open(request)
    
    user_url = "https://www.sxt.cn/index/user.html"
    request = Request(user_url,headers=headers)
    response = opener.open(request)
    print(response.read().decode())

      3) Using a cookiejar to save cookies to a file:

    from urllib.request import HTTPCookieProcessor, Request, build_opener
    from fake_useragent import UserAgent
    from urllib.parse import urlencode
    from http.cookiejar import MozillaCookieJar
    
    headers = {
        "User-Agent": UserAgent().random,
    }
    
    # log in, get the cookie, and save it to a file
    def get_cookie():
        login_url = "https://www.sxt.cn/index/login/login"
        data = {
            "user": "17703181473",
            "password": "123456",
        }
        request = Request(login_url, headers=headers, data=urlencode(data).encode())
        cookiejar = MozillaCookieJar()
        cookie = HTTPCookieProcessor(cookiejar)
        opener = build_opener(cookie)
        response = opener.open(request)
        # save the cookies to a file; ignore_discard also saves session cookies, ignore_expires also saves expired ones
        cookiejar.save("cookie.txt", ignore_discard=True, ignore_expires=True)
    
    # load the cookie from the file and open the user page
    def use_cookie():
        user_url = "https://www.sxt.cn/index/user.html"
        request = Request(user_url, headers=headers)
        cookiejar = MozillaCookieJar()
        # both keyword arguments need to be True
        cookiejar.load("cookie.txt",ignore_expires=True,ignore_discard=True)
        cookie = HTTPCookieProcessor(cookiejar)
        opener = build_opener(cookie)
        response = opener.open(request)
        print(response.read().decode())
    
    
    if __name__ == "__main__":
        get_cookie()
        use_cookie()

     5. URLError: catching exceptions:

    from urllib.request import Request,urlopen
    from urllib.error import URLError
    from fake_useragent import UserAgent
    url = "https://www.sxt.cn/index/login/login.htmls"
    headers = {
        "User-Agent":UserAgent().random
    }
    try:
        request = Request(url,headers=headers)
        response = urlopen(request)
        print(response.read().decode())
    except URLError as e:
        # HTTP Error 404: Not Found
        # <urlopen error [Errno 11004] getaddrinfo failed>
        if e.args == ():    # HTTPError: the URL does not exist, e.code is 404
            print(e.code)
        else:               # the domain does not resolve, errno 11004
            print(e.args[0].errno)
    print("成功")

    6. The requests library:

    1) GET request:
    import requests
    url = "http://www.baidu.com/s"
    params = {
        "wd":"尚学堂"
    }
    response = requests.get(url,params=params)
    print(response.text)
    
    
    2) POST request:
    import requests
    url = "https://www.sxt.cn/index/login/login"
    data = {
        "user": "17703181473",
        "password": "123456",
    }
    # data carries the POST form data
    response = requests.post(url,data=data)
    print(response.text)
    
    3) Proxies:
    import requests
    from fake_useragent import UserAgent
    url = "http://httpbin.org/get"
    headers = {
        "User-Agent":UserAgent().random
    }
    proxy = {
        # format of a proxy entry
        "http":"http://124.235.135.210:80"
        # dedicated proxy (paid):
        # "http":"http://user:pwd@ip:port"
    }
    # proxies routes the request through the proxy
    response = requests.get(url,headers=headers,proxies=proxy)
    print(response.text)
    
    4) Certificates:
    import requests
    from fake_useragent import UserAgent
    url = "https://www.12306.cn/index/"
    headers = {
        "User-Agent":UserAgent().random
    }
    # suppress the insecure-request warning
    requests.packages.urllib3.disable_warnings()
    # verify=False skips certificate verification
    response = requests.get(url,headers=headers,verify=False)
    response.encoding="utf-8"   # set the response encoding
    print(response.text)
    
    5) Cookies:
    import requests
    from fake_useragent import UserAgent
    session = requests.session()  # send all requests through the session so cookies persist
    headers = {
        "User-Agent":UserAgent().random
    }
    login_url = "https://www.sxt.cn/index/login/login"
    data = {
        "user": "17703181473",
        "password": "123456",
    }
    response = session.post(login_url,headers=headers,data=data)
    url = "https://www.sxt.cn/index/user.html"
    response = session.get(url,headers=headers)    # the session already carries the login cookie
    print(response.text)

    7. The re module:

    import re
    str = "Iss study Python3.6 Everyday"
    ################## 1、match####################
    result = re.match("I",str)
    print(result.group())  # I
    result = re.match("w",str)
    print(result.group())  # I
    result = re.match(".",str)
    print(result.group())  # I
    result = re.match("D",str)  # D 非数字
    print(result.group())  # I
    
    result = re.match("i",str,re.I)  # re.I忽略大小写
    print(result.group())  # I
    result = re.match("S",str,re.I)  # S非空格
    print(result.group())  # I
    
    result = re.match("Study",str,re.I)  # S非空格
    # print(result.group())  # None 匹配不到
    
    ###############2、search 搜索一个###############
    str = "Iss study Python3.6 Everyday study"
    result2= re.search("study",str)
    print(result2.group()) # study
    result2= re.search("stw+",str)
    print(result2.group()) # study
    result2= re.search("Pw+.d",str)
    print(result2.group()) # Python3.6
    
    ###############3、finall 找所有内容################
    str = "Iss study Python3.6 Everyday"
    result3 = re.findall("y",str)
    print(result3)           # [y,y,y,y] 找到所有的字符y
    str2 = "<div class='info' id='info' ><a href='http://www.sxt.com'>尚学堂ewew</a></div>"
    result3 = re.findall("<a href='http://www.sxt.com'>(.*)</a>",str2)
    print(result3)           # [尚学堂ewew]
    result3 = re.findall("<a href='http://www.sxt.com'>(.+)</a>",str2)
    print(result3)           # [尚学堂ewew]
    result3 = re.findall("<a href='http://www.sxt.com'>([u4E00-u9FA5]w+)</a>",str2)
    print(result3)            # [尚学堂ewew]
    result3 = re.findall("<a href='(.+)'>",str2)
    print(result3)           # ['http://www.sxt.com']
    
    ###############4、sub 替换 #########################
    str3 = "<div><a href='http://www.sxt.com'>尚学堂ewew</a></div>"
    result4 = re.sub('<div>(.*)</div>',r"<span>1</span>",str3)     # 记得加r
    print(result4)   # <span><a href='http://www.sxt.com'>尚学堂ewew</a></span>

     8. Qiushibaike example:

    import requests,re
    from fake_useragent import UserAgent
    url = "https://www.qiushibaike.com/text/page/1/"
    response = requests.get(url)
    # s+ 是 空格
    infos = re.findall('<div class="content">s+<span>s+(.*)s+</span>',response.text)
    for info in infos:
        info = info.replace("<br/>","")      # 去除<br/>
        print(info)
        with open("duanzi.txt","a+",encoding="utf-8") as f:
            f.write(info+"
    
    
    ")

    9. BeautifulSoup4:


    from bs4 import BeautifulSoup
    from bs4.element import Comment
    str = '''
    <title>尚学堂</title>
    <div class='info div1' float='left'>Welcome to SXT</div>
    <div class='info' float='right'>
        <span>Good Good Study</span>
        <a href='www.bistx.cn'></a>
        <strong><!--没有用--></strong>
    </div>
    '''
    
    soup = BeautifulSoup(str,"lxml")
    ################ 1. Get a tag ######################
    print(soup.title)  # <title>尚学堂</title>  get the title tag
    print(soup.div)    # <div class="info div1" float="left">Welcome to SXT</div>  the first div by default

    ################ 2. Get a tag's attributes #################
    print(soup.div.attrs)  # {'class': ['info', 'div1'], 'float': 'left'}  attributes of the first div by default
    print(soup.div.get("class")) # ['info', 'div1']
    print(soup.div["class"])     # ['info', 'div1']
    print(soup.a["href"])     # www.bistx.cn
    
    
    ################ 3. Get text: .string is a NavigableString object, .text is a plain str ############
    print(soup.div.string)        # Welcome to SXT
    print(type(soup.div.string))  # <class 'bs4.element.NavigableString'>
    print(soup.div.text)          #  Welcome to SXT
    print(type(soup.div.text))    # <class 'str'>
    
    print(soup.strong.string)                      # 没有用
    print(type(soup.strong.string))                # <class 'bs4.element.Comment'>  an HTML comment is a Comment object
    print(soup.strong.text,type(soup.strong.text)) # ""  <class 'str'>
    
    if type(soup.strong.string) == Comment:    # if it is a Comment object (an HTML comment)
        print(soup.strong.string)           # 没有用
        print(soup.strong.prettify())       # <strong><!--没有用--></strong>
    else:
        print(soup.strong.text)
    
    ################ 4. find_all and find: get tag objects, also by attribute #############
    
    print(soup.find_all("title"))  # [<title>尚学堂</title>]  获得所有的title标签
    print(soup.find_all("div"))    # 获得两个div  <div class='info' ... </div> <div>....</div>
    print(soup.find_all(float="left")) # [<div class="info div1" float="left">Welcome to SXT</div>]
    # class 是关键字,所以加_
    print(soup.find_all(class_="div1")) # [<div class="info div1" float="left">Welcome to SXT</div>]
    print(soup.find_all("div",attrs={"float":"left"})) # [<div class="info div1" float="left">Welcome to SXT</div>]
    
    ################ 5. select: CSS selectors ############
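    # a minimal sketch: select takes a CSS selector and returns a list of matching tags
    print(soup.select("title"))       # [<title>尚学堂</title>]
    print(soup.select("div.info"))    # both divs with class "info"
    print(soup.select(".div1"))       # [<div class="info div1" float="left">Welcome to SXT</div>]
    print(soup.select("div > span"))  # [<span>Good Good Study</span>]
    print(soup.select("a[href]"))     # [<a href="www.bistx.cn"></a>]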

    10. lxml etree and XPath:

    # install a Chrome XPath plugin to test XPath expressions without rerunning the code every time
    import requests
    from lxml import etree
    
    url = "https://www.qidian.com/rank/yuepiao?style=1"
    response = requests.get(url)
    e = etree.HTML(response.text)
    bookNames = e.xpath("//h4/a/text()")
    authors = e.xpath("//p[@class='author']/a[1]/text()")
    
    # for num in range(len(bookNames)):
    #     print(bookNames[num]+":"+authors[num])
    # zip also prints book name and author together
    for name,author in zip(bookNames,authors):
        print(name+":"+author)
    
    # 1. XPath as used in scrapy:
    item_list = response.xpath("//div[@id='content-list']/div[@class='item']")
    
    # 2. BeautifulSoup uses find and find_all:
    soup = BeautifulSoup(response1.text,"html.parser")
    div = soup.find(name="div",attrs={"id":"auto-channel-lazyload-article"})
    print(soup.find_all(class_="div1"))
    
    # 3. re uses findall:
    result3 = re.findall("<a href='http://www.sxt.com'>(.*)</a>",str2)
    print(result3)            # ['尚学堂ewew']

    11. Data extraction with pyquery:

    pip3 install pyquery
    
    from pyquery import PyQuery as pq
    import requests
    from fake_useragent import UserAgent
    url="http://www.xicidaili.com/nn"
    headers={
        "User-Agent":UserAgent().random
    }
    response = requests.get(url,headers=headers)
    doc =  pq(response.text)
    trs = doc("#ip_list tr")  # all tr rows under the element with id ip_list
    for num in range(1,len(trs)):
        ip = trs.eq(num).find("td").eq(1).text() # note: use eq(), not [num]
        port = trs.eq(num).find("td").eq(2).text()
        type = trs.eq(num).find("td").eq(5).text()
        print(ip+":"+port+":"+type)

    12. Data extraction with jsonpath:

      1) json's dumps and load functions:

    import json
    str = '{"name":"无名之辈"}'
    ################ 1. loads: convert a JSON string into a Python object #############
    obj = json.loads(str)
    print(obj,type(obj))    # {'name': '无名之辈'} <class 'dict'>
    
    ################ 2. dumps: convert a Python object into a JSON string ##############
    result2 = json.dumps(obj,ensure_ascii=False)
    print(result2,type(result2))  # {"name": "无名之辈"} <class 'str'>
    
    ############## 3. dump: write the object to a file as JSON (mind the encoding) ##################
    json.dump(obj,open("movie.txt","w",encoding="utf-8"),ensure_ascii=False)
    
    ############## 4. load: read the JSON in a file back into a Python object (mind the encoding) #########
    str3 = json.load(open("movie.txt",encoding="utf-8"))
    print(str3,type(str3))   # {'name': '无名之辈'} <class 'dict'>

       2) JsonPath:

        Install the module: pip3 install jsonpath

    from jsonpath import jsonpath
    import requests,json
    # city list data from lagou.com
    url="https://www.lagou.com/lbs/getAllCitySearchLabels.json"
    response = requests.get(url)
    
    names = jsonpath(json.loads(response.text),"$..name") # the expression must start with $
    codes = jsonpath(json.loads(response.text),"$..code")
    
    for code,name in zip(codes,names):
        print(code+":"+name)
    
    Note: response.text vs response.content vs response.json():
    1. response.json() parses the response body as JSON and returns the resulting Python object.
    2. Return type: response.text returns the body as a decoded (unicode) string, while response.content returns the raw bytes. Use .text for textual data and .content for images or other files.
    3. Encoding: response.content is the undecoded binary body; response.text falls back to "iso-8859-1" when the server does not declare an encoding, otherwise requests guesses the encoding from the response.
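
    A small sketch illustrating the three accessors (httpbin.org is just an example endpoint that returns JSON):

    import requests

    response = requests.get("http://httpbin.org/get")
    print(type(response.text))      # <class 'str'>   decoded text
    print(type(response.content))   # <class 'bytes'> raw bytes
    print(response.json()["url"])   # parsed JSON -> a Python dict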
  • Original post: https://www.cnblogs.com/fangsheng/p/10310584.html