• The most basic crawler


    # 1. Import the module
    from urllib import request


    # 2. Steps
    # (1) Define the target URL
    base_url = "http://www.langlang2017.com/index.html"


    # Request headers

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # "Accept-Encoding": "gzip, deflate",  # never send this header (keep it commented out):
        # with it the server may return gzip-compressed bytes, and the plain
        # .decode("utf-8") used on responses below would fail on them
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    }
    req = request.Request(base_url, headers=headers)  # build a Request object that carries the headers
    # Request() parameters:
    # a. url: the address to fetch
    # b. data: (default None) data submitted along with the request (e.g. a POST body);
    #    when data is given, the HTTP method switches from GET to POST.
    # c. headers: (default empty) a dict of key/value pairs to send as HTTP headers.
    # c.1 User-Agent: identifies the browser.
    #     History: Netscape vs. IE -- Netscape lost the browser war, and its
    #     engineers moved on to Mozilla (where the code was open-sourced).

    # Add more header fields
    req.add_header("Connection", "keep-alive")

    # Read a header field back
    print(req.get_header("Connection"))
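
The Request object above is built but never sent. A minimal sketch of the missing step (not part of the original tutorial): pass the prepared req to request.urlopen() and decode the body, assuming the site serves UTF-8 pages as the decode calls elsewhere in this post do.

    # Send the prepared Request and read the response
    response = request.urlopen(req)

    print(response.status)                      # HTTP status code, e.g. 200
    print(response.getheader("Content-Type"))   # individual response headers

    html = response.read().decode("utf-8")      # bytes -> str (assumes UTF-8 pages)
    print(html[:200])                           # first 200 characters of the page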


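Comment (b) above says a non-empty data argument flips the request from GET to POST. A hedged sketch of that switch; httpbin.org/post is a public echo service used here purely for illustration, and the form fields are made up, neither being part of the original tutorial.

    from urllib import parse

    # Form fields to submit; urlencode() turns the dict into "name=langlang&year=2017"
    form = {"name": "langlang", "year": "2017"}
    data = parse.urlencode(form).encode("utf-8")   # data must be bytes

    # Because data is not None, urllib sends this request as a POST
    post_req = request.Request("https://httpbin.org/post", data=data, headers=headers)
    with request.urlopen(post_req) as resp:
        print(resp.status)                         # 200 on success
        print(resp.read().decode("utf-8")[:300])   # echo of what we sent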


    # Use the urllib library to request and save the pages of langlang2017
    # (a whole-site loop is sketched after this script)
    # 1. Import the modules
    from urllib import request
    from urllib import error



    # 2. Steps
    # (1) Build the URL
    base_url = "http://www.langlang2017.com/route.html"

    try:
        # (2) Request the URL
        # Note: timeout=0.02 (20 ms) is extremely short and will almost always
        # raise URLError, which exercises the except branch below; raise it
        # (e.g. timeout=10) if you actually want the page saved.
        response = request.urlopen(base_url, timeout=0.02)

        # (3) Read the body (bytes)
        html = response.read()

        # (4) Decode to text
        html = html.decode("utf-8")

        # (5) Save to disk
        with open("route.html", "w", encoding="utf-8") as f:
            f.write(html)

    except error.URLError as e:
        print(e)
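
The header comment promises the whole langlang2017 site, yet the script fetches only route.html. A sketch of one way to cover every page; the page list below is an assumption about the site's structure, not something the tutorial confirms.

    # Assumed page names on www.langlang2017.com -- adjust to the real site map
    pages = ["index.html", "route.html", "contact.html"]

    for page in pages:
        url = "http://www.langlang2017.com/" + page
        try:
            response = request.urlopen(url, timeout=10)
            html = response.read().decode("utf-8")
            with open(page, "w", encoding="utf-8") as f:   # save under the page's own name
                f.write(html)
            print("saved:", page)
        except error.URLError as e:
            # skip pages that time out or do not exist (HTTPError subclasses URLError)
            print("failed:", page, e)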