• Python Web Scraping Quick-Start Tutorial


    1 What is this

           A summary of the commonly used requests APIs in Python

    2 Code

    from bs4 import BeautifulSoup
    import requests
    import re
    
    # Use requests to make HTTP requests, and BeautifulSoup to parse the HTML page
    s_url = "https://www.baidu.com"
    o_header = {
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8', 
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    } 
    n_timeout = 36  # request timeout in seconds
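
    # Added sketch (not in the original post): when making many requests to the same site,
    # a requests.Session reuses the TCP connection and keeps the shared headers in one place.
    o_session = requests.Session()
    o_session.headers.update(o_header)
    resp_session = o_session.get(s_url, timeout=n_timeout)  # same as requests.get, but pooled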
    
    
    '''
    Download a resource file: a URL that returns an image, audio or video
    '''
    s_img_url = "https://example.com/img.jpg"  # placeholder URL of the resource to download
    s_local_url = "img.jpg"                    # local file to save it to
    resp_stream = requests.get(s_img_url, stream=True, headers=o_header, timeout=n_timeout)  # download the image
    
    with open(s_local_url, 'wb') as fp:
        fp.write(resp_stream.content)  # .content buffers the whole response body in memory
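
    # Added sketch (not in the original post): for large files it is usually better to stream
    # the body in chunks with iter_content instead of buffering it all via .content.
    resp_big = requests.get(s_img_url, stream=True, headers=o_header, timeout=n_timeout)
    with open("img_streamed.jpg", 'wb') as fp:
        for chunk in resp_big.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                fp.write(chunk)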
    
    '''
    Handle HTML: a URL that returns a web page
    '''
    # Send a GET request
    resp = requests.get(s_url, headers=o_header, timeout=n_timeout)
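
    # Added sketch (not in the original post): check the HTTP status before parsing;
    # raise_for_status() raises requests.HTTPError on 4xx/5xx responses.
    resp.raise_for_status()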
    
    
    # Decode with the detected (apparent) encoding
    resp.encoding = resp.apparent_encoding
    
    
    # Parse the HTML string into a soup
    soup__html = BeautifulSoup(resp.text, "lxml")
    
    
    # Find the first <a> whose id attribute is "h"
    soup__h = soup__html.find("a", id="h")
    print(soup__h.text)
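
    # Added sketch (not in the original post): attributes of a found tag can be read like a
    # dict; .get() returns None instead of raising KeyError when the attribute is missing.
    print(soup__h.get("href"))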
    
    
    # Find all <img> tags whose class attribute is "abc"
    soup__img_s = soup__html.find_all("img", class_="abc")
    for soup__img in soup__img_s:
        print(soup__img["src"], soup__img.text)
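
    # Added sketch (not in the original post): the same query written as a CSS selector,
    # using BeautifulSoup's select() method.
    for soup__img in soup__html.select("img.abc"):
        print(soup__img.get("src"))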
    
    
    # Find the first <a> whose "abc" attribute is "opq"
    soup__a = soup__html.find("a", attrs={"abc": "opq"})
    print(soup__a.text)
    
    
    # Find the first <a> whose "abc" attribute matches "opq", "opq 1", "opq 2", etc. (regex match)
    soup__a = soup__html.find("a", attrs={"abc": re.compile(r"opq(\s\w+)?")})
    print(soup__a.text)
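
    # Added sketch (not in the original post): another commonly used requests call, submitting
    # form data with a POST request; the URL and field names here are placeholders.
    resp_post = requests.post("https://example.com/login",
                              data={"user": "foo", "pwd": "bar"},
                              headers=o_header, timeout=n_timeout)
    print(resp_post.status_code)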
    
    
  • Original article: https://www.cnblogs.com/Kalafinaian/p/11440996.html