python简单爬虫

python简单爬虫

python简单爬虫

爬虫文字（1）

# requests库
## requests.get(url)  模拟浏览器打开网页

# re库
## re.findall(pattern, string)  返回字符串中所有与正则表达式匹配的内容（列表）

import requests
import re
# Scrape titles and their contents from the ishuo.cn front page and print
# them as "title | content" rows.
response = requests.get('http://ishuo.cn/')  # fetch the page the way a browser would
# response.status_code: 200 means success; 404 means the page is missing.
# response.encoding: the encoding used to decode response.text (e.g. utf-8).
data = response.text

# `.` matches any character except a newline; `*?` repeats it as few times
# as possible, so each capture stops at the first closing tag.
content_res = re.findall('<div class="content">(.*?)</div>', data)
title_res = re.findall('<a href="/subject/.*?">(.*?)</a>', data)

# Only entries 10..59 of the "/subject/" links are kept.
# NOTE(review): presumably the other matches are not article titles; the
# bounds appear to have been found by hand with title_res.index(...) -- confirm
# against the live page.
title_res = title_res[10:60]

# Pair the n-th title with the n-th content.  zip() stops at the shorter list,
# so a page with fewer contents than titles no longer raises IndexError.
title_content_dic = dict(zip(title_res, content_res))

for title, content in title_content_dic.items():
    print(f'{title:<40} | {content}')

爬虫文字（2）

import requests
import re

# Scrape the ishuo.cn front page entry by entry and print each title together
# with its (content, description) pair.
response = requests.get('http://ishuo.cn/')  # fetch the page the way a browser would

data = response.text

# Split the page into one chunk per <li class="list_li"> element so that the
# title, content and description are always taken from the same entry.
res = re.findall('<li class="list_li">(.*?)</li>', data)

title_content_desc_dic = {}
for item in res:
    content_match = re.search('<div class="content">(.*?)</div>', item)
    title_match = re.search('<a href="/subject/.*?">(.*?)</a>', item)
    # NOTE: the description pattern is hard-coded to text starting with '04月'
    # (April); entries whose description starts differently do not match.
    desc_match = re.search('</a>(04月.*?)</div>', item)

    # Skip entries that lack any of the three parts instead of crashing with
    # IndexError on the first incomplete one.
    if not (content_match and title_match and desc_match):
        continue

    title = title_match.group(1)
    title_content_desc_dic[title] = (content_match.group(1), desc_match.group(1))

for title, content_desc in title_content_desc_dic.items():
    print(f'{title:<40} | {content_desc}')

爬虫图片

import requests
import re

# Download every image referenced by a data-src attribute on the listing page
# and save each one in the current directory.
response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg')
data = response.text

# Every data-src attribute value is treated as an image URL.
img_url_res = re.findall('data-src="(.*?)"', data)
for img_url in img_url_res:
    img_response = requests.get(img_url)
    img_data = img_response.content
    # Use the last path segment of the URL as the local file name.
    img_name = img_url.split('/')[-1]
    # The with-block flushes and closes the file even if the write fails;
    # the original left every file handle open.
    with open(img_name, 'wb') as f:
        f.write(img_data)

爬虫视频

import requests
import re

# Walk the links on the mod.gov.cn video index page, open each linked page,
# extract the .mp4 address embedded in it and download the video.
response = requests.get('http://www.mod.gov.cn/v/index.htm')
data = response.text

# This pattern captures everything between href=" and the next `">`, so for
# anchors with extra attributes (e.g. class="img") the capture also contains
# those attributes; the page path is cut out of it inside the loop.
mp4_res2 = re.findall('<a href="(.*?)">', data)

for href in mp4_res2:
    # Keep only the part up to and including the first "htm".
    page_match = re.search('(.*?htm)', href)
    if page_match is None:
        # Not a link to an .htm page; skip it instead of raising IndexError.
        continue
    page_url = 'http://www.mod.gov.cn/v/' + page_match.group(1)

    page_data = requests.get(page_url).text
    # The video address follows a "//Video " marker in the page source, e.g.
    # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    # (the dot before "mp4" is escaped so that it only matches a literal dot).
    video_match = re.search(r'//Video (.*?\.mp4)', page_data)
    if video_match is None:
        # The page carries no video address; nothing to download.
        continue
    url_res = video_match.group(1)

    mp4_response = requests.get(url_res)
    mp4_data = mp4_response.content
    # Name the file after the last path segment of the video URL so that each
    # download gets its own file instead of overwriting a single test.mp4.
    mp4_name = url_res.split('/')[-1]
    # The with-block guarantees the file is flushed and closed.
    with open(mp4_name, 'wb') as f:
        f.write(mp4_data)

'''
<a href="2019-07/20/content_4846213.htm" class="img"><img src="attachement/jpg/site21/20190720/6c4b9041ab8b1e9ca1be01.jpg" border="0"><em class="video_40x40"></em></a>
'''

相关阅读:
poj1088 经典dp
poj2301
poj1050(nyoj104 zoj1074)dp问题
hdu1003
poj1001（高精度）
图的深度优先遍历DFS
jquery中attr和prop的区别
Apache 配置域名入口路径
关于启动定时器和取消定时器的问题
Web攻防之XSS,CSRF,SQL注入

原文地址：https://www.cnblogs.com/SkyOceanchen/p/11225291.html