• python简单爬虫


    python简单爬虫

    爬虫文字(1)

    # requests库
    ## requests.get(url)  模拟浏览器打开网页
    
    # re库
    
    import requests
    import re
    # Scrape ishuo.cn and print a "title | content" table.
    response = requests.get('http://ishuo.cn/')  # fetch the page the way a browser would
    # response.status_code: 200 means success (301 / 404 = moved / page missing).
    # response.encoding: the charset requests detected for the page.
    data = response.text

    # `.` matches any character (except a newline), `*` repeats it zero or
    # more times, and the trailing `?` makes the repeat lazy so each capture
    # stops at the first closing tag.
    content_res = re.findall(r'<div class="content">(.*?)</div>', data)
    title_res = re.findall(r'<a href="/subject/.*?">(.*?)</a>', data)

    # Only matches 10..59 of the title list are used; the bounds are
    # hard-coded (presumably found by hand with title_res.index(...) on the
    # first and last wanted titles -- re-check them if the page layout changes).
    title_res = title_res[10:60]

    # Pair titles and contents by position. zip() stops at the shorter list,
    # so a page with fewer content blocks than titles no longer raises
    # IndexError the way indexing content_res[i] did.
    title_content_dic = dict(zip(title_res, content_res))

    for title, content in title_content_dic.items():
        # Left-align the title in a 40-character column.
        print(f'{title:<40} | {content}')
    
    

    爬虫文字(2)

    import requests
    import re
    
    # Scrape ishuo.cn one list item at a time so that title, content and
    # description always come from the same <li> element.
    response = requests.get('http://ishuo.cn/')  # fetch the page the way a browser would

    data = response.text

    # Each captured string is the inner HTML of one list entry.
    res = re.findall(r'<li class="list_li">(.*?)</li>', data)

    title_content_desc_dic = {}
    for item_html in res:
        content_match = re.search(r'<div class="content">(.*?)</div>', item_html)
        title_match = re.search(r'<a href="/subject/.*?">(.*?)</a>', item_html)
        # The description pattern is anchored on the literal text "04月", so
        # only entries whose description starts with that text are matched.
        desc_match = re.search(r'</a>(04月.*?)</div>', item_html)

        # Skip entries missing any of the three parts instead of crashing
        # with IndexError on an empty findall() result.
        if content_match is None or title_match is None or desc_match is None:
            continue

        title_content_desc_dic[title_match.group(1)] = (
            content_match.group(1),
            desc_match.group(1),
        )

    for title, content_and_desc in title_content_desc_dic.items():
        # Left-align the title in a 40-character column; the value prints as
        # a (content, desc) tuple.
        print(f'{title:<40} | {content_and_desc}')
    

    爬虫图片

    import requests
    import re
    
    # Download every image referenced by a data-src attribute on the listing
    # page and save it in the current directory.
    response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg')
    data = response.text

    img_url_res = re.findall(r'data-src="(.*?)"', data)
    for img_url in img_url_res:
        # Name the file after the last path segment of the URL.
        img_name = img_url.split('/')[-1]
        if not img_name:
            # A URL ending in "/" yields an empty name that open() cannot use.
            continue

        img_response = requests.get(img_url)
        # .content is the raw response body as bytes, hence binary mode.
        # The with-block closes (and flushes) the file; the original left
        # one file handle open per image.
        with open(img_name, 'wb') as f:
            f.write(img_response.content)
    

    爬虫视频

    import requests
    import re
    
    # Walk the video index page, open each linked detail page, and download
    # the .mp4 that the detail page references.
    response = requests.get('http://www.mod.gov.cn/v/index.htm')
    data = response.text

    # Sample anchor from the index page:
    #   <a href="2019-07/20/content_4846213.htm" class="img"><img src="attachement/jpg/site21/20190720/6c4b9041ab8b1e9ca1be01.jpg" border="0"><em class="video_40x40"></em></a>
    # Because the capture runs lazily up to the first `">`, it can also
    # swallow trailing attributes (e.g. `" class="img`); the .htm path is cut
    # out of it below.
    mp4_res2 = re.findall(r'<a href="(.*?)">', data)

    for href in mp4_res2:
        # Keep only the part of the href up to and including the first "htm".
        page_match = re.search(r'(.*?htm)', href)
        if page_match is None:
            # Not a link to a detail page; the original crashed here with
            # IndexError.
            continue
        page_url = 'http://www.mod.gov.cn/v/' + page_match.group(1)

        page_data = requests.get(page_url).text
        # The detail page carries the video address after a "//Video " marker,
        # e.g. http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
        # The dot is escaped so only a literal ".mp4" suffix matches.
        video_match = re.search(r'//Video (.*?\.mp4)', page_data)
        if video_match is None:
            # Detail page without a video marker -- nothing to download.
            continue

        mp4_response = requests.get(video_match.group(1))
        # NOTE(review): every video is written to the same 'test.mp4', so each
        # download overwrites the previous one and only the last survives.
        # Kept as in the original; confirm whether per-video names are wanted.
        with open('test.mp4', 'wb') as f:
            f.write(mp4_response.content)
    
    
  • 相关阅读:
    poj1088 经典dp
    poj2301
    poj1050(nyoj104 zoj1074)dp问题
    hdu1003
    poj1001(高精度)
    图的深度优先遍历DFS
    jquery中attr和prop的区别
    Apache 配置域名入口路径
    关于启动定时器和取消定时器的问题
    Web攻防之XSS,CSRF,SQL注入
  • 原文地址:https://www.cnblogs.com/SkyOceanchen/p/11225291.html
Copyright © 2020-2023  润新知