• Web scraping notes


    Memo

    import urllib.parse
    import urllib.request
    # Encode the form data with urlencode, then convert it to UTF-8 bytes
    data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf-8")
    response = urllib.request.urlopen("http://httpbin.org/post", data=data)    # open the target page (passing data makes this a POST request)
    html = response.read()    # read the response body
    print(html)    # print what was read

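    # Sketch: the same module can send a GET request with custom headers by wrapping the
    # URL in a Request object first (httpbin.org/get is assumed here as a test endpoint).
    req = urllib.request.Request("http://httpbin.org/get",
                                 headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req) as resp:
        print(resp.status)                     # HTTP status code
        print(resp.read().decode("utf-8"))     # decoded response body
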
    import requests
    data = {"word": "hello"}    # form parameters
    response = requests.post("http://httpbin.org/post", data=data)
    print("Status code:", response.status_code)
    # print("Request URL:", response.url)
    # print("Response headers:", response.headers)
    # print("Cookies:", response.cookies)
    # print("Page source as text:", response.text)
    print("Page source as bytes:", response.content)

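    # Sketch: httpbin.org/post echoes the submitted form back as JSON, so the same response
    # can also be parsed with response.json() (assuming the body is valid JSON).
    echoed = response.json()
    print(echoed["form"])    # {'word': 'hello'}
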
    import requests
    url = "https://book.douban.com/tag/营销"    # request URL
    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit"
                             "/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
    response = requests.get(url, headers=headers)    # send the request
    print(response.content.decode("utf-8"))    # decode the byte content as UTF-8 and print the page source

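    # Sketch: a requests.Session keeps the User-Agent (and any cookies) across calls, so the
    # headers dict above does not have to be repeated on every request.
    session = requests.Session()
    session.headers.update(headers)
    print(session.get(url).status_code)    # reuse the tag page URL defined above
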
    import requests
    # Import three exception classes from requests.exceptions
    from requests.exceptions import ReadTimeout, HTTPError, RequestException
    # Send the request 50 times in a loop
    for a in range(0, 50):
        try:    # catch exceptions
            url = "https://www.baidu.com/"    # request URL
            response = requests.get(url, timeout=0.06)    # set the timeout to 0.06 s
            print(response.status_code)    # print the status code
        except ReadTimeout:
            print("timeout")
        except HTTPError:
            print("httperror")
        except RequestException:
            print("requesterror")

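    # Sketch: instead of a manual loop, retries can be configured once on a Session through
    # urllib3's Retry (the total=3 and backoff_factor=0.5 values are assumed, not required).
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.5)))
    print(session.get("https://www.baidu.com/", timeout=3).status_code)
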
    import requests
    # Configure proxy IPs (the dict keys are the URL schemes)
    proxy = {"http": "60.188.90.33:3000",
             "https": "183.128.240.228:6666"}
    response = requests.get("https://www.baidu.com/", proxies=proxy)
    print(response.content.decode("utf-8"))

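    # Sketch: httpbin.org/ip reports the origin IP of the request, which is a quick way to
    # check whether the proxy is actually in use (the proxy IPs above may well be stale).
    print(requests.get("http://httpbin.org/ip", proxies=proxy, timeout=5).text)
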
    import requests
    from bs4 import BeautifulSoup
    response = requests.get("http://news.baidu.com/")    # send the request
    bs = BeautifulSoup(response.text, "lxml")    # parse the page source text with the lxml parser
    print(bs.find("title").text)    # find the title tag and print its text

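    # Sketch: find_all() returns every matching tag; here it lists the text and href of each
    # link on the page (attributes read with .get() simply return None when missing).
    for link in bs.find_all("a"):
        print(link.get_text(strip=True), link.get("href"))
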
    # Task: write the scraped content to a local file
    import requests
    url = "https://www.bilibili.com/"
    def use_requests(url):
        response = requests.get(url)
        # print(response.text)
        file_path = "E:/PyCharmProjects/SpyderStudy/哔哩哔哩首页.html"
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(response.text)
    if __name__ == "__main__":
        use_requests(url)

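    # Sketch: for large or binary files, stream=True plus iter_content() avoids holding the
    # whole response in memory (the favicon URL and output name below are only examples).
    with requests.get("https://www.bilibili.com/favicon.ico", stream=True) as r:
        with open("favicon.ico", "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
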
    # Save the Selenium-rendered page source; the webdriver setup lines here are a
    # reconstruction assumed from the file name used below.
    from selenium import webdriver
    driver = webdriver.Chrome()
    driver.get("https://www.bilibili.com/")
    data = driver.page_source    # rendered page source
    file_path = "E:/PyCharmProjects/SpyderStudy/哔哩哔哩_selenium_chrome.html"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(data)

    driver.close()

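    # Sketch: for JavaScript-heavy pages an explicit wait ensures some content has rendered
    # before page_source is read (the 10 s timeout and <body> locator are assumed values).
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    driver = webdriver.Chrome()
    driver.get("https://www.bilibili.com/")
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    print(len(driver.page_source))    # size of the rendered source
    driver.quit()
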
    import urllib.request
    import requests
    from bs4 import BeautifulSoup
    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
    key = "小说"
    key_ASCII = urllib.request.quote(key)    # percent-encode the Chinese tag for the URL
    url = "https://book.douban.com/tag/" + str(key_ASCII) + "?start=0&type=T"
    response = requests.get(url, headers=headers)
    bs = BeautifulSoup(response.text, "lxml")
    a = bs.select("#subject_list > ul > li:nth-child(1) > div.info > h2 > a")
    a_1 = a[0]
    print(a_1.get_text().replace("\n", "").replace(" ", ""))    # first book title, whitespace stripped

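    # Sketch: dropping :nth-child(1) from the selector matches every book entry on the page,
    # so all titles can be printed in one loop (selector layout assumed from the line above).
    for tag in bs.select("#subject_list > ul > li > div.info > h2 > a"):
        print(tag.get_text().replace("\n", "").replace(" ", ""))
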
    import urllib.request
    import requests

    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}

    key = "华语"
    key_ASCII = urllib.request.quote(key)    # percent-encode the tag value
    url = "https://movie.douban.com/j/search_subjects?type=movie&tag=" + str(
        key_ASCII) + "&sort=recommend&page_limit=20&page_start=0"
    response = requests.get(url, headers=headers)
    print(response)    # prints the Response object, e.g. <Response [200]>
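
    # Sketch: the endpoint returns JSON; assuming the payload holds a "subjects" list whose
    # items carry "title" and "rate" fields, it can be parsed instead of printing the object.
    for movie in response.json().get("subjects", []):
        print(movie.get("title"), movie.get("rate"))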