• 爬虫作业


    1、爬取三国演义:http://www.shicimingju.com/book/sanguoyanyi.html

    代码:

    # Crawl the full text of "Romance of the Three Kingdoms" from shicimingju.com
    # and write every chapter (title followed by body) into "<book name>.txt".
    import requests
    from bs4 import BeautifulSoup

    # Identify as a normal browser; many sites reject the default requests UA.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    }

    # One Session reuses the TCP connection across the ~120 chapter requests.
    session = requests.Session()
    res = session.get('https://www.shicimingju.com/book/sanguoyanyi.html', headers=HEADERS)
    soup = BeautifulSoup(res.text, 'lxml')
    # Book title; used as the output file name.
    book_name = soup.find(class_='bookmark-list').find(name='h1').text
    # Anchor tags linking to every chapter page.
    url_list = soup.select('.book-mulu ul li a')
    # Open the output file ONCE in write mode: the original re-opened it in
    # append mode per chapter, which duplicated the whole book on every rerun.
    with open('%s.txt' % book_name, 'w', encoding='utf-8') as f:
        for line in url_list:
            url = 'https://www.shicimingju.com' + line.attrs.get('href')
            # Fetch the individual chapter page.
            res1 = session.get(url, headers=HEADERS)
            soup1 = BeautifulSoup(res1.text, 'lxml')
            # Chapter title.
            title = soup1.select('.bookmark-list h1')[0].text
            # Chapter body text.
            content = soup1.find(class_='chapter_content').text
            f.write(title)
            f.write(content)
            # Blank line between chapters so they don't run together.
            f.write('\n')

     2、爬取肯德基门店信息:http://www.kfc.com.cn/kfccda/storelist/index.aspx

    # Fetch KFC store information for Shanghai from the official store-list
    # AJAX endpoint and print a simplified list of {storeName, addressDetail, pro}.
    import requests

    # POST form body understood by GetStoreList.ashx: city name plus paging.
    data = {
        'cname': '上海',
        'pid': '',
        'keyword': '',
        'pageIndex': 1,
        'pageSize': 1000
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
        'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
    }
    # op=cname selects the "search by city name" operation of the endpoint.
    res = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx', params={'op': 'cname'}, data=data, headers=header)

    # The endpoint returns JSON, so parse it directly with Response.json();
    # the original also built a BeautifulSoup object from it that was never used.
    # Default to [] so an unexpected payload doesn't crash the comprehension.
    kfc_info = res.json().get('Table1', [])
    kfc_list = [
        {
            "storeName": kfc.get('storeName') + '餐厅',
            "addressDetail": kfc.get("addressDetail"),
            "pro": kfc.get("pro")
        }
        for kfc in kfc_info
    ]

    print(kfc_list)
    print(len(kfc_list))  # 455 stores at the time of writing

     3、爬取拉钩网职位信息

    # Crawl Lagou job listings for "python" in Shanghai. Lagou's JSON endpoint
    # only answers requests carrying cookies issued by the human-facing search
    # page, so we visit that page first within the same Session.
    import requests

    # The JSON API that actually serves the job data.
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    # Form body: first page ('pn': 1) of results for keyword ('kd') "python".
    payload = {
        'first': 'true',
        'pn': '1',
        'kd': 'python',
    }

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Accept': 'application/json, text/javascript, */*; q=0.01'
    }
    # The search page whose response sets the anti-crawler cookies.
    urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    # One Session so cookies from the first request carry over to the second.
    s = requests.Session()
    # Prime the session with the search page's cookies.
    s.get(urls, headers=header, timeout=3)
    # The Session re-sends its own cookies automatically, so the original's
    # explicit `cookies=s.cookies` argument was redundant and has been dropped.
    response = s.post(url, data=payload, headers=header, params={'city': '上海'}, timeout=5).text
    print(response)
  • 相关阅读:
    判断是否为数字
    viewPage
    向左对齐的Gallery
    QQ登入(6)腾讯微博-获取微博用户信息,发送微博
    QQ登入(5)获取空间相册,新建相册,上传图片到空间相册
    QQ登入(4)QQ分享-内容转载
    QQ登入(3)QQ空间分享-无需登入
    Codeforces Round #210
    zoj 3716
    Codeforces Round #209 (Div. 2)
  • 原文地址:https://www.cnblogs.com/baicai37/p/13429806.html
Copyright © 2020-2023  润新知