• 爬虫学习(五)——百度贴吧的爬取


    import os
    import time
    import urllib.request
    import urllib.parse


    # Ask for the forum name and page range, then crawl every page in that range.
    def header():
        """Interactively crawl a range of list pages from one Baidu Tieba forum.

        Prompts on stdin for the forum name, the first page and the last page
        (both inclusive), then for each page builds a request with
        ``headle_request`` and saves the response with ``download``.

        Raises:
            ValueError: if either page prompt is answered with a non-integer.
        """
        base_url = "https://tieba.baidu.com/f?"
        forum = input("请输入要爬取的吧名")
        first_page = int(input("请输入起始页"))
        last_page = int(input("请输入结束页"))

        current = first_page
        while current <= last_page:
            print("正在爬取第%s页" % current)
            download(headle_request(current, base_url, forum), forum, current)
            # Wait between requests so the site does not treat the crawl as an attack.
            time.sleep(2)
            current += 1


    # Build the request object: encode the query parameters and join them to the URL.
    def headle_request(page, url, baming):
        """Return a ``urllib.request.Request`` for one list page of a forum.

        Args:
            page: 1-based page number; converted to the ``pn`` offset as
                ``(page - 1) * 50``.
            url: Base URL the query string is appended to. It may end with
                ``?`` (as the caller in this file passes it), already carry a
                query string, or have no query part at all.
            baming: Forum name, sent as the ``kw`` parameter (URL-encoded).

        Returns:
            A ``Request`` for the assembled URL carrying a browser
            ``User-Agent`` header.
        """
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
        query = urllib.parse.urlencode({
            "kw": baming,
            "ie": "utf8",
            "pn": (page - 1) * 50,
        })
        # Choose the separator so the result is a valid URL whether or not the
        # base URL already ends with "?"/"&" or already contains a query string.
        if url.endswith(("?", "&")):
            separator = ""
        elif "?" in url:
            separator = "&"
        else:
            separator = "?"
        return urllib.request.Request(url + separator + query, headers=headers)


    # Fetch the data for a request object and store it as an HTML file.
    def download(request, baming, page):
        """Fetch *request* and save the raw response body to disk.

        The body is written unchanged (binary mode) to
        ``<baming>/<baming>第<page>页.html`` relative to the current working
        directory; the folder is created when it does not exist yet. The file
        name is also printed.

        Args:
            request: URL string or ``urllib.request.Request`` to open.
            baming: Forum name; used as the folder name and file-name prefix.
            page: Page number embedded in the file name.

        Raises:
            urllib.error.URLError: if the request cannot be opened.
            OSError: if the folder or file cannot be created.
        """
        # Read inside a context manager so the response (and its connection)
        # is always closed, even when reading fails.
        with urllib.request.urlopen(request) as response:
            content = response.read()
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(baming, exist_ok=True)
        # Assemble the file name and its path inside the forum folder.
        filename = baming + "第%s页" % page + ".html"
        print(filename)
        filepath = os.path.join(baming, filename)
        # Store the response bytes as received.
        with open(filepath, "wb") as tf:
            tf.write(content)
    # Start the interactive crawl only when this file is run as a script.
    if __name__ == "__main__":
        header()



  • 相关阅读:
    [C++] socket
    [C++] socket
    2014-3-16 星期天 晴[改变生活规律,稳中求进]
    [C++] socket
    [C++] socket
    [ACM_水题] Yet Another Story of Rock-paper-scissors [超水 剪刀石头布]
    easyui datagrid如何获取到每行的文本框
    JS传递数组到后台
    如何将js的object对象传到后台--->JavaScript之对象序列化
    EasyUI常用控件禁用方法
  • 原文地址:https://www.cnblogs.com/kuangkuangduangduang/p/10369636.html
Copyright © 2020-2023  润新知